diff --git a/.clang-format b/.clang-format
index 1d2ad9a77f..b50c1facfb 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,27 +1,71 @@
-BasedOnStyle : google
+BasedOnStyle : LLVM
+# Indent formatting
 IndentWidth : 2
-BreakBeforeBraces : Linux
+Language: Cpp
+UseTab: Never
 KeepEmptyLinesAtTheStartOfBlocks : true
 MaxEmptyLinesToKeep : 2
 AccessModifierOffset : -2
-UseTab: Never
+# This must be off so that include order in RAJA is preserved
+SortIncludes: false
+
+# Alignment of consecutive declarations, assignments etc
+AlignConsecutiveAssignments : true
+AlignConsecutiveDeclarations : false
+AlignConsecutiveMacros : true
+AlignTrailingComments : true
+AlwaysBreakAfterDefinitionReturnType: false
+
+# Control curly brace placement
+BreakBeforeBraces : Custom
+BraceWrapping:
+  AfterCaseLabel:  true
+  AfterClass:      true
+  AfterControlStatement: true
+  AfterEnum:       true
+  AfterFunction:   true
+  AfterNamespace:  true
+  AfterObjCDeclaration: false
+  AfterStruct:     true
+  AfterUnion:      true
+  AfterExternBlock: false
+  BeforeCatch:     true
+  BeforeElse:      true
+  BeforeLambdaBody: true
+  IndentBraces:    false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+
+# Pointer alignment
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# Single line config
 AllowShortIfStatementsOnASingleLine : true
-ConstructorInitializerAllOnOneLineOrOnePerLine : true
 AllowShortFunctionsOnASingleLine : true
 AllowShortLoopsOnASingleLine : false
-BinPackParameters : false
+AllowAllArgumentsOnNextLine : true
 AllowAllParametersOfDeclarationOnNextLine : false
-AlignTrailingComments : true
+BinPackArguments : true
+BinPackParameters : false
+ConstructorInitializerAllOnOneLineOrOnePerLine : true
 ColumnLimit : 80
-PenaltyBreakBeforeFirstCallParameter : 100
-PenaltyReturnTypeOnItsOwnLine : 65000
-PenaltyBreakString : 10
 
-# These improve formatting results but require clang 3.6/7 or higher
-BreakBeforeBinaryOperators : None
-AlignAfterOpenBracket: true
-BinPackArguments : false
+AlignAfterOpenBracket: Align
 AlignOperands : true
 AlwaysBreakTemplateDeclarations : true
-Cpp11BracedListStyle : true
+BreakBeforeBinaryOperators : None
 
+SpaceBeforeCpp11BracedList: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInContainerLiterals: false
+SpacesInConditionalStatement: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b31cbe124..dbe5b3f113 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ project(RAJA LANGUAGES CXX C
   VERSION ${RAJA_LOADED})
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PATH})
-
+set(BLT_REQUIRED_CLANGFORMAT_VERSION  "14" CACHE STRING "")
 include(cmake/SetupRajaOptions.cmake)
 
 cmake_minimum_required(VERSION 3.23)
@@ -136,6 +136,9 @@ include(cmake/SetupCompilers.cmake)
 # Macros for building executables and libraries
 include (cmake/RAJAMacros.cmake)
 
+# Configure `style` target for enforcing code style
+raja_add_code_checks()
+
 set (raja_sources
   src/AlignedRangeIndexSetBuilders.cpp
   src/DepGraphNode.cpp
diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake
index c412593db7..11c4661cc1 100644
--- a/cmake/RAJAMacros.cmake
+++ b/cmake/RAJAMacros.cmake
@@ -204,3 +204,62 @@ macro(raja_add_benchmark)
     NUM_OMP_THREADS ${arg_NUM_OMP_THREADS}
     COMMAND ${TEST_DRIVER} ${arg_NAME})
 endmacro(raja_add_benchmark)
+
+##------------------------------------------------------------------------------
+## raja_add_code_checks()
+##
+## Adds code checks for all source files recursively in the RAJA repository.
+## Takes no arguments; does nothing when RAJA is built as part of another project.
+## This creates the following parent build targets:
+##  check - Runs a non file changing style check and CppCheck
+##  style - In-place code formatting
+##
+## Creates various child build targets that follow this pattern:
+##  RAJA_<check|style>
+##  RAJA_<cppcheck|clangformat>_<check|style>
+##------------------------------------------------------------------------------
+macro(raja_add_code_checks)
+
+  set(options)
+  set(singleValueArgs)
+  set(multiValueArgs)
+
+  # No keywords are defined, so parsing only collects stray arguments
+  cmake_parse_arguments(arg
+       "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  # Only do code checks if building raja by itself and not included in
+  # another project
+  if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
+      # Create file globbing expressions that only include directories that contain source
+      # TODO(bowen) Add examples and exercises to the list below
+      set(_base_dirs "RAJA" "benchmark" "include" "src" "test")
+      set(_ext_expressions "*.cpp" "*.hpp" "*.inl"
+                           "*.cxx" "*.hxx" "*.cc" "*.c" "*.h" "*.hh")
+
+      set(_glob_expressions)
+      foreach(_exp ${_ext_expressions})
+          foreach(_base_dir ${_base_dirs})
+              list(APPEND _glob_expressions "${PROJECT_SOURCE_DIR}/${_base_dir}/${_exp}")
+          endforeach()
+      endforeach()
+
+      # Glob for list of files to run code checks on
+      set(_sources)
+      file(GLOB_RECURSE _sources ${_glob_expressions})
+
+      blt_add_code_checks(PREFIX          RAJA
+                          SOURCES         ${_sources}
+                          CLANGFORMAT_CFG_FILE ${PROJECT_SOURCE_DIR}/.clang-format
+                          CPPCHECK_FLAGS  --enable=all --inconclusive)
+
+      # Set FOLDER property for code check targets; names must use the PREFIX given above
+      foreach(_suffix clangformat_check clangformat_style cppcheck_check clang_tidy_check clang_tidy_style)
+          set(_tgt RAJA_${_suffix})
+          if(TARGET ${_tgt})
+              set_target_properties(${_tgt} PROPERTIES FOLDER "RAJA/code_checks")
+          endif()
+      endforeach()
+  endif()
+
+endmacro(raja_add_code_checks)
diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp
index 59cca4bf22..abc965b0f5 100644
--- a/include/RAJA/RAJA.hpp
+++ b/include/RAJA/RAJA.hpp
@@ -88,7 +88,7 @@
 #endif
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/desul.hpp"
+#include "RAJA/policy/desul.hpp"
 #endif
 
 #include "RAJA/index/IndexSet.hpp"
@@ -197,11 +197,13 @@
 
 #include "RAJA/pattern/sort.hpp"
 
-namespace RAJA {
-namespace expt{}
+namespace RAJA
+{
+namespace expt
+{}
 //  // provide a RAJA::expt namespace for experimental work, but bring alias
 //  // it into RAJA so it doesn't affect user code
 //  using namespace expt;
-}
+}  // namespace RAJA
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp
index 1a467c8341..3261c27b7a 100644
--- a/include/RAJA/index/IndexSet.hpp
+++ b/include/RAJA/index/IndexSet.hpp
@@ -34,8 +34,16 @@
 namespace RAJA
 {
 
-enum PushEnd { PUSH_FRONT, PUSH_BACK };
-enum PushCopy { PUSH_COPY, PUSH_NOCOPY };
+enum PushEnd
+{
+  PUSH_FRONT,
+  PUSH_BACK
+};
+enum PushCopy
+{
+  PUSH_COPY,
+  PUSH_NOCOPY
+};
 
 template <typename... TALL>
 class TypedIndexSet;
@@ -55,8 +63,9 @@ namespace indexset
 template <typename SEG_ITER_POLICY_T, typename SEG_EXEC_POLICY_T = void>
 struct ExecPolicy
     : public RAJA::make_policy_pattern_t<SEG_EXEC_POLICY_T::policy,
-                                         RAJA::Pattern::forall> {
-  using seg_it = SEG_ITER_POLICY_T;
+                                         RAJA::Pattern::forall>
+{
+  using seg_it   = SEG_ITER_POLICY_T;
   using seg_exec = SEG_EXEC_POLICY_T;
 };
 
@@ -77,7 +86,7 @@ using policy::indexset::ExecPolicy;
 template <typename T0, typename... TREST>
 class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 {
-  using PARENT = TypedIndexSet<TREST...>;
+  using PARENT               = TypedIndexSet<TREST...>;
   static const int T0_TypeId = sizeof...(TREST);
 
 public:
@@ -91,7 +100,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
   //! Construct empty index set
 #if _MSC_VER < 1910
-   // this one instance of constexpr does not work on VS2012 or VS2015
+  // this one instance of constexpr does not work on VS2012 or VS2015
   RAJA_INLINE TypedIndexSet() : PARENT() {}
 #else
   RAJA_INLINE constexpr TypedIndexSet() : PARENT() {}
@@ -99,12 +108,12 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
   //! Copy-constructor for index set
   RAJA_INLINE
-  TypedIndexSet(TypedIndexSet<T0, TREST...> const &c)
-      : PARENT((PARENT const &)c)
+  TypedIndexSet(TypedIndexSet<T0, TREST...> const& c) : PARENT((PARENT const&)c)
   {
     size_t num = c.data.size();
     data.resize(num);
-    for (size_t i = 0; i < num; ++i) {
+    for (size_t i = 0; i < num; ++i)
+    {
       data[i] = c.data[i];
     }
     // mark all as not owned by us
@@ -112,9 +121,10 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   }
 
   //! Copy-assignment operator for index set
-  TypedIndexSet<T0, TREST...> &operator=(const TypedIndexSet<T0, TREST...> &rhs)
+  TypedIndexSet<T0, TREST...>& operator=(const TypedIndexSet<T0, TREST...>& rhs)
   {
-    if (&rhs != this) {
+    if (&rhs != this)
+    {
       TypedIndexSet<T0, TREST...> copy(rhs);
       this->swap(copy);
     }
@@ -125,19 +135,21 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   RAJA_INLINE ~TypedIndexSet()
   {
     size_t num_seg = data.size();
-    for (size_t i = 0; i < num_seg; ++i) {
+    for (size_t i = 0; i < num_seg; ++i)
+    {
       // Only free segment of we allocated it
-      if (owner[i]) {
+      if (owner[i])
+      {
         delete data[i];
       }
     }
   }
 
   //! Swap function for copy-and-swap idiom.
-  void swap(TypedIndexSet<T0, TREST...> &other)
+  void swap(TypedIndexSet<T0, TREST...>& other)
   {
     // Swap parents data
-    PARENT::swap((PARENT &)other);
+    PARENT::swap((PARENT&)other);
     // Swap our data
     using std::swap;
     swap(data, other.data);
@@ -150,18 +162,20 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   /// This is used to implement the == and != operators
   ///
   template <typename P0, typename... PREST>
-  RAJA_INLINE bool compareSegmentById(
-      size_t segid,
-      const TypedIndexSet<P0, PREST...> &other) const
+  RAJA_INLINE bool
+  compareSegmentById(size_t segid,
+                     const TypedIndexSet<P0, PREST...>& other) const
   {
     // drill down our types until we have the right type
-    if (getSegmentTypes()[segid] != T0_TypeId) {
+    if (getSegmentTypes()[segid] != T0_TypeId)
+    {
       // peel off T0
       return PARENT::compareSegmentById(segid, other);
     }
 
     // Check that other's segid is of type T0
-    if (!other.template checkSegmentType<T0>(segid)) {
+    if (!other.template checkSegmentType<T0>(segid))
+    {
       return false;
     }
 
@@ -174,7 +188,8 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   template <typename P0>
   RAJA_INLINE bool checkSegmentType(size_t segid) const
   {
-    if (getSegmentTypes()[segid] == T0_TypeId) {
+    if (getSegmentTypes()[segid] == T0_TypeId)
+    {
       return std::is_same<T0, P0>::value;
     }
     return PARENT::template checkSegmentType<P0>(segid);
@@ -183,22 +198,24 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
   //! get specified segment by ID
   template <typename P0>
-  RAJA_INLINE P0 &getSegment(size_t segid)
+  RAJA_INLINE P0& getSegment(size_t segid)
   {
-    if (getSegmentTypes()[segid] == T0_TypeId) {
+    if (getSegmentTypes()[segid] == T0_TypeId)
+    {
       Index_type offset = getSegmentOffsets()[segid];
-      return *reinterpret_cast<P0 const *>(data[offset]);
+      return *reinterpret_cast<P0*>(data[offset]);  // P0 sets constness
     }
     return PARENT::template getSegment<P0>(segid);
   }
 
   //! get specified segment by ID
   template <typename P0>
-  RAJA_INLINE P0 const &getSegment(size_t segid) const
+  RAJA_INLINE P0 const& getSegment(size_t segid) const
   {
-    if (getSegmentTypes()[segid] == T0_TypeId) {
+    if (getSegmentTypes()[segid] == T0_TypeId)
+    {
       Index_type offset = getSegmentOffsets()[segid];
-      return *reinterpret_cast<P0 const *>(data[offset]);
+      return *reinterpret_cast<P0 const*>(data[offset]);
     }
     return PARENT::template getSegment<P0>(segid);
   }
@@ -231,20 +248,25 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
 private:
   template <typename... CALL>
-  RAJA_INLINE void push_into(TypedIndexSet<CALL...> &c,
-                             PushEnd pend = PUSH_BACK,
+  RAJA_INLINE void push_into(TypedIndexSet<CALL...>& c,
+                             PushEnd pend   = PUSH_BACK,
                              PushCopy pcopy = PUSH_COPY)
   {
     Index_type num = getNumSegments();
 
-    if (pend == PUSH_BACK) {
-      for (Index_type i = 0; i < num; ++i) {
+    if (pend == PUSH_BACK)
+    {
+      for (Index_type i = 0; i < num; ++i)
+      {
         segment_push_into(i, c, pend, pcopy);
-      } 
-    } else {
-      for (Index_type i = num-1; i > -1; --i) {
+      }
+    }
+    else
+    {
+      for (Index_type i = num - 1; i > -1; --i)
+      {
         segment_push_into(i, c, pend, pcopy);
-      } 
+      }
     }
   }
 
@@ -257,66 +279,71 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 public:
   template <typename... CALL>
   RAJA_INLINE void segment_push_into(size_t segid,
-                                     TypedIndexSet<CALL...> &c,
-                                     PushEnd pend = PUSH_BACK,
+                                     TypedIndexSet<CALL...>& c,
+                                     PushEnd pend   = PUSH_BACK,
                                      PushCopy pcopy = PUSH_COPY)
   {
-    if (getSegmentTypes()[segid] != T0_TypeId) {
+    if (getSegmentTypes()[segid] != T0_TypeId)
+    {
       PARENT::segment_push_into(segid, c, pend, pcopy);
       return;
     }
     Index_type offset = getSegmentOffsets()[segid];
-    switch (value_for(pend, pcopy)) {
-      case value_for(PUSH_BACK, PUSH_COPY):
-        c.push_back(*data[offset]);
-        break;
-      case value_for(PUSH_BACK, PUSH_NOCOPY):
-        c.push_back_nocopy(data[offset]);
-        break;
-      case value_for(PUSH_FRONT, PUSH_COPY):
-        c.push_front(*data[offset]);
-        break;
-      case value_for(PUSH_FRONT, PUSH_NOCOPY):
-        c.push_front_nocopy(data[offset]);
-        break;
+    switch (value_for(pend, pcopy))
+    {
+    case value_for(PUSH_BACK, PUSH_COPY):
+      c.push_back(*data[offset]);
+      break;
+    case value_for(PUSH_BACK, PUSH_NOCOPY):
+      c.push_back_nocopy(data[offset]);
+      break;
+    case value_for(PUSH_FRONT, PUSH_COPY):
+      c.push_front(*data[offset]);
+      break;
+    case value_for(PUSH_FRONT, PUSH_NOCOPY):
+      c.push_front_nocopy(data[offset]);
+      break;
     }
   }
 
 
   //! Add segment to back end of index set without making a copy.
   template <typename Tnew>
-  RAJA_INLINE void push_back_nocopy(Tnew *val)
+  RAJA_INLINE void push_back_nocopy(Tnew* val)
   {
     push_internal(val, PUSH_BACK, PUSH_NOCOPY);
   }
 
   //! Add segment to front end of index set without making a copy.
   template <typename Tnew>
-  RAJA_INLINE void push_front_nocopy(Tnew *val)
+  RAJA_INLINE void push_front_nocopy(Tnew* val)
   {
     push_internal(val, PUSH_FRONT, PUSH_NOCOPY);
   }
 
   //! Add copy of segment to back end of index set.
   template <typename Tnew>
-  RAJA_INLINE void push_back(Tnew &&val)
+  RAJA_INLINE void push_back(Tnew&& val)
   {
-    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)), PUSH_BACK, PUSH_COPY);
+    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)),
+                  PUSH_BACK, PUSH_COPY);
   }
 
   //! Add copy of segment to front end of index set.
   template <typename Tnew>
-  RAJA_INLINE void push_front(Tnew &&val)
+  RAJA_INLINE void push_front(Tnew&& val)
   {
-    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)), PUSH_FRONT, PUSH_COPY);
+    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)),
+                  PUSH_FRONT, PUSH_COPY);
   }
 
   //! Return total length -- sum of lengths of all segments
   RAJA_INLINE size_t getLength() const
   {
     size_t total = PARENT::getLength();
-    size_t num = data.size();
-    for (size_t i = 0; i < num; ++i) {
+    size_t num   = data.size();
+    for (size_t i = 0; i < num; ++i)
+    {
       total += data[i]->size();
     }
     return total;
@@ -339,13 +366,12 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   ///
   RAJA_SUPPRESS_HD_WARN
   template <typename BODY, typename... ARGS>
-  RAJA_HOST_DEVICE void segmentCall(size_t segid,
-                                    BODY &&body,
-                                    ARGS &&... args) const
+  RAJA_HOST_DEVICE void
+  segmentCall(size_t segid, BODY&& body, ARGS&&... args) const
   {
-    if (getSegmentTypes()[segid] != T0_TypeId) {
-      PARENT::segmentCall(segid,
-                          std::forward<BODY>(body),
+    if (getSegmentTypes()[segid] != T0_TypeId)
+    {
+      PARENT::segmentCall(segid, std::forward<BODY>(body),
                           std::forward<ARGS>(args)...);
       return;
     }
@@ -356,24 +382,23 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 protected:
   //! Internal logic to add a new segment -- catch invalid type insertion
   template <typename Tnew>
-  RAJA_INLINE void push_internal(Tnew *val,
-                                 PushEnd pend = PUSH_BACK,
-                                 PushCopy pcopy = PUSH_COPY)
+  RAJA_INLINE void
+  push_internal(Tnew* val, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY)
   {
     static_assert(sizeof...(TREST) > 0, "Invalid type for this TypedIndexSet");
     PARENT::push_internal(val, pend, pcopy);
   }
 
   //! Internal logic to add a new segment
-  RAJA_INLINE void push_internal(T0 *val,
-                                 PushEnd pend = PUSH_BACK,
-                                 PushCopy pcopy = PUSH_COPY)
+  RAJA_INLINE void
+  push_internal(T0* val, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY)
   {
     data.push_back(val);
     owner.push_back(pcopy == PUSH_COPY);
 
     // Determine if we push at the front or back of the segment list
-    if (pend == PUSH_BACK) {
+    if (pend == PUSH_BACK)
+    {
       // Store the segment type
       getSegmentTypes().push_back(T0_TypeId);
 
@@ -384,7 +409,9 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
       size_t icount = val->size();
       getSegmentIcounts().push_back(getTotalLength());
       increaseTotalLength(icount);
-    } else {
+    }
+    else
+    {
       // Store the segment type
       getSegmentTypes().push_front(T0_TypeId);
 
@@ -394,7 +421,8 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
       // Store the segment icount
       getSegmentIcounts().push_front(0);
       size_t icount = val->size();
-      for (size_t i = 1; i < getSegmentIcounts().size(); ++i) {
+      for (size_t i = 1; i < getSegmentIcounts().size(); ++i)
+      {
         getSegmentIcounts()[i] += icount;
       }
       increaseTotalLength(icount);
@@ -402,7 +430,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   }
 
   //! Returns the number of indices (the total icount of segments
-  RAJA_INLINE Index_type &getTotalLength() { return PARENT::getTotalLength(); }
+  RAJA_INLINE Index_type& getTotalLength() { return PARENT::getTotalLength(); }
 
   //! set total length of the indexset
   RAJA_INLINE void setTotalLength(int n) { return PARENT::setTotalLength(n); }
@@ -437,9 +465,10 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   {
     TypedIndexSet<T0, TREST...> retVal;
 
-    int minSeg = RAJA::operators::maximum<int>{}(0, begin);
-    int maxSeg = RAJA::operators::minimum<int>{}(end, getNumSegments());
-    for (int i = minSeg; i < maxSeg; ++i) {
+    int minSeg = RAJA::operators::maximum<int> {}(0, begin);
+    int maxSeg = RAJA::operators::minimum<int> {}(end, getNumSegments());
+    for (int i = minSeg; i < maxSeg; ++i)
+    {
       segment_push_into(i, retVal, PUSH_BACK, PUSH_NOCOPY);
     }
     return retVal;
@@ -452,13 +481,15 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   /// This TypedIndexSet will not change and the created "slice" into it
   /// will not own any of its segments.
   ///
-  TypedIndexSet<T0, TREST...> createSlice(const int *segIds, int len)
+  TypedIndexSet<T0, TREST...> createSlice(const int* segIds, int len)
   {
     TypedIndexSet<T0, TREST...> retVal;
 
     int numSeg = getNumSegments();
-    for (int i = 0; i < len; ++i) {
-      if (segIds[i] >= 0 && segIds[i] < numSeg) {
+    for (int i = 0; i < len; ++i)
+    {
+      if (segIds[i] >= 0 && segIds[i] < numSeg)
+      {
         segment_push_into(segIds[i], retVal, PUSH_BACK, PUSH_NOCOPY);
       }
     }
@@ -476,12 +507,14 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   /// iterator type must de-reference to an integral value.
   ///
   template <typename T>
-  TypedIndexSet<T0, TREST...> createSlice(const T &segIds)
+  TypedIndexSet<T0, TREST...> createSlice(const T& segIds)
   {
     TypedIndexSet<T0, TREST...> retVal;
     int numSeg = getNumSegments();
-    for (auto &seg : segIds) {
-      if (seg >= 0 && seg < numSeg) {
+    for (auto& seg : segIds)
+    {
+      if (seg >= 0 && seg < numSeg)
+      {
         segment_push_into(seg, retVal, PUSH_BACK, PUSH_NOCOPY);
       }
     }
@@ -492,7 +525,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   void setSegmentInterval(size_t interval_id, int begin, int end)
   {
     m_seg_interval_begin[interval_id] = begin;
-    m_seg_interval_end[interval_id] = end;
+    m_seg_interval_end[interval_id]   = end;
   }
 
   //! get lower bound of segment identified with interval_id
@@ -509,37 +542,37 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
 protected:
   //! Returns the mapping of  segment_index -> segment_type
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentTypes()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentTypes()
   {
     return PARENT::getSegmentTypes();
   }
 
   //! Returns the mapping of  segment_index -> segment_type
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentTypes() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentTypes() const
   {
     return PARENT::getSegmentTypes();
   }
 
   //! Returns the mapping of  segment_index -> segment_offset
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentOffsets()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentOffsets()
   {
     return PARENT::getSegmentOffsets();
   }
 
   //! Returns the mapping of  segment_index -> segment_offset
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentOffsets() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentOffsets() const
   {
     return PARENT::getSegmentOffsets();
   }
 
   //! Returns the icount of segments
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentIcounts()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentIcounts()
   {
     return PARENT::getSegmentIcounts();
   }
 
   //! Returns the icount of segments
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentIcounts() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentIcounts() const
   {
     return PARENT::getSegmentIcounts();
   }
@@ -552,13 +585,15 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   ///       types and indices; e.g., dependency info not checked.
   ///
   template <typename P0, typename... PREST>
-  RAJA_INLINE bool operator==(const TypedIndexSet<P0, PREST...> &other) const
+  RAJA_INLINE bool operator==(const TypedIndexSet<P0, PREST...>& other) const
   {
     size_t num_seg = getNumSegments();
     if (num_seg != other.getNumSegments()) return false;
 
-    for (size_t segid = 0; segid < num_seg; ++segid) {
-      if (!compareSegmentById(segid, other)) {
+    for (size_t segid = 0; segid < num_seg; ++segid)
+    {
+      if (!compareSegmentById(segid, other))
+      {
         return false;
       }
     }
@@ -567,14 +602,14 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
   //! Inequality operator returns true if any segment is not equal, else false.
   template <typename P0, typename... PREST>
-  RAJA_INLINE bool operator!=(const TypedIndexSet<P0, PREST...> &other) const
+  RAJA_INLINE bool operator!=(const TypedIndexSet<P0, PREST...>& other) const
   {
     return (!(*this == other));
   }
 
 private:
   //! vector of TypedIndexSet data objects of type T0
-  RAJA::RAJAVec<T0 *> data;
+  RAJA::RAJAVec<T0*> data;
 
   //! vector indicating which segments are owned by the TypedIndexSet
   RAJA::RAJAVec<Index_type> owner;
@@ -603,16 +638,16 @@ class TypedIndexSet<>
 
   //! Copy-constructor.
   RAJA_INLINE
-  TypedIndexSet(TypedIndexSet const &c)
+  TypedIndexSet(TypedIndexSet const& c)
   {
-    segment_types = c.segment_types;
+    segment_types   = c.segment_types;
     segment_offsets = c.segment_offsets;
     segment_icounts = c.segment_icounts;
-    m_len = c.m_len;
+    m_len           = c.m_len;
   }
 
   //! Swap function for copy-and-swap idiom (deep copy).
-  void swap(TypedIndexSet &other)
+  void swap(TypedIndexSet& other)
   {
     using std::swap;
     swap(segment_types, other.segment_types);
@@ -625,7 +660,7 @@ class TypedIndexSet<>
   RAJA_INLINE static size_t getNumTypes() { return 0; }
 
   template <typename T>
-  RAJA_INLINE constexpr bool isValidSegmentType(T const &) const
+  RAJA_INLINE constexpr bool isValidSegmentType(T const&) const
   {
     // Segment type wasn't found
     return false;
@@ -637,40 +672,39 @@ class TypedIndexSet<>
 
   template <typename BODY, typename... ARGS>
   RAJA_INLINE void segmentCall(size_t, BODY, ARGS...) const
-  {
-  }
+  {}
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentTypes()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentTypes()
   {
     return segment_types;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentTypes() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentTypes() const
   {
     return segment_types;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentOffsets()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentOffsets()
   {
     return segment_offsets;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentOffsets() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentOffsets() const
   {
     return segment_offsets;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentIcounts()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentIcounts()
   {
     return segment_icounts;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentIcounts() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentIcounts() const
   {
     return segment_icounts;
   }
 
-  RAJA_INLINE Index_type &getTotalLength() { return m_len; }
+  RAJA_INLINE Index_type& getTotalLength() { return m_len; }
 
   RAJA_INLINE void setTotalLength(int n) { m_len = n; }
 
@@ -678,7 +712,7 @@ class TypedIndexSet<>
 
   template <typename P0, typename... PREST>
   RAJA_INLINE bool compareSegmentById(size_t,
-                                      const TypedIndexSet<P0, PREST...> &) const
+                                      const TypedIndexSet<P0, PREST...>&) const
   {
     return false;
   }
@@ -690,34 +724,29 @@ class TypedIndexSet<>
   }
 
   template <typename P0>
-  RAJA_INLINE P0 &getSegment(size_t)
+  RAJA_INLINE P0& getSegment(size_t)
   {
-    return *((P0 *)(this - this));
+    return *((P0*)(this - this));
   }
 
   template <typename P0>
-  RAJA_INLINE P0 const &getSegment(size_t) const
+  RAJA_INLINE P0 const& getSegment(size_t) const
   {
-    return *((P0 *)(this - this));
+    return *((P0*)(this - this));
   }
 
   template <typename... CALL>
-  RAJA_INLINE void push_into(TypedIndexSet<CALL...> &, PushEnd, PushCopy) const
-  {
-  }
+  RAJA_INLINE void push_into(TypedIndexSet<CALL...>&, PushEnd, PushCopy) const
+  {}
 
   template <typename... CALL>
-  RAJA_INLINE void segment_push_into(size_t,
-                                     TypedIndexSet<CALL...> &,
-                                     PushEnd,
-                                     PushCopy) const
-  {
-  }
+  RAJA_INLINE void
+  segment_push_into(size_t, TypedIndexSet<CALL...>&, PushEnd, PushCopy) const
+  {}
 
   template <typename Tnew>
-  RAJA_INLINE void push(Tnew const &, PushEnd, PushCopy)
-  {
-  }
+  RAJA_INLINE void push(Tnew const&, PushEnd, PushCopy)
+  {}
 
 public:
   using iterator = Iterators::numeric_iterator<Index_type>;
@@ -762,13 +791,15 @@ namespace type_traits
 
 template <typename T>
 struct is_index_set
-    : ::RAJA::type_traits::SpecializationOf<RAJA::TypedIndexSet, typename std::decay<T>::type> {
-};
+    : ::RAJA::type_traits::SpecializationOf<RAJA::TypedIndexSet,
+                                            typename std::decay<T>::type>
+{};
 
 template <typename T>
 struct is_indexset_policy
-    : ::RAJA::type_traits::SpecializationOf<RAJA::ExecPolicy, typename std::decay<T>::type> {
-};
+    : ::RAJA::type_traits::SpecializationOf<RAJA::ExecPolicy,
+                                            typename std::decay<T>::type>
+{};
 }  // namespace type_traits
 
 }  // namespace RAJA
diff --git a/include/RAJA/index/IndexSetBuilders.hpp b/include/RAJA/index/IndexSetBuilders.hpp
index 543524be01..075aecd1d1 100644
--- a/include/RAJA/index/IndexSetBuilders.hpp
+++ b/include/RAJA/index/IndexSetBuilders.hpp
@@ -37,13 +37,13 @@ namespace RAJA
  * \brief Generate an index set with aligned Range segments and List segments,
  *        as needed, from given array of indices.
  *
- *        Routine does no error-checking on argements and assumes 
+ *        Routine does no error-checking on arguments and assumes
  *        RAJA::Index_type array contains valid indices.
  *
- *  \param iset reference to index set generated with aligned range segments 
+ *  \param iset reference to index set generated with aligned range segments
  *         and list segments. Method assumes index set is empty (no segments).
- *  \param work_res camp resource object that identifies the memory space in 
- *         which list segment index data will live (passed to list segment 
+ *  \param work_res camp resource object that identifies the memory space in
+ *         which list segment index data will live (passed to list segment
  *         ctor).
  *  \param indices_in pointer to start of input array of indices.
  *  \param length size of input index array.
@@ -79,37 +79,36 @@ void RAJASHAREDDLL_API buildIndexSetAligned(
  ******************************************************************************
  *
  * \brief Generate a lock-free "block" index set (planar division) containing
- *        range segments. 
+ *        range segments.
  *
- *        The method chunks a fastDim x midDim x slowDim mesh into blocks that 
+ *        The method chunks a fastDim x midDim x slowDim mesh into blocks that
  *        can be dependency-scheduled, removing need for lock constructs.
  *
  *  \param iset reference to index set generated with range segments.
- *         Method assumes index set is empty (no segments). 
+ *         Method assumes index set is empty (no segments).
  *  \param fastDim "fast" block dimension (see above).
  *  \param midDim  "mid" block dimension (see above).
  *  \param slowDim "slow" block dimension (see above).
  *
  ******************************************************************************
  */
-void buildLockFreeBlockIndexset(
-    RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
-    int fastDim,
-    int midDim,
-    int slowDim);
+void buildLockFreeBlockIndexset(RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
+                                int fastDim,
+                                int midDim,
+                                int slowDim);
 
 /*!
  ******************************************************************************
  *
  * \brief Generate a lock-free "color" index set containing range and list
  *        segments.
- * 
- *        TThe domain-set is colored based on connectivity to the range-set. 
- *        All elements in each segment are independent, and no two segments 
+ *
+ *        The domain-set is colored based on connectivity to the range-set.
+ *        All elements in each segment are independent, and no two segments
  *        can be executed in parallel.
  *
- * \param iset reference to index set generated. Method assumes index set 
- *        is empty (no segments). 
+ * \param iset reference to index set generated. Method assumes index set
+ *        is empty (no segments).
  * \param work_res camp resource object that identifies the memory space in
  *         which list segment index data will live (passed to list segment
  *         ctor).
@@ -123,7 +122,7 @@ void buildLockFreeColorIndexset(
     int numEntity,
     int numRangePerDomain,
     int numEntityRange,
-    RAJA::Index_type* elemPermutation = nullptr,
+    RAJA::Index_type* elemPermutation  = nullptr,
     RAJA::Index_type* ielemPermutation = nullptr);
 
 }  // namespace RAJA
diff --git a/include/RAJA/index/IndexSetUtils.hpp b/include/RAJA/index/IndexSetUtils.hpp
index 4baea450fc..d5da3e9e19 100644
--- a/include/RAJA/index/IndexSetUtils.hpp
+++ b/include/RAJA/index/IndexSetUtils.hpp
@@ -31,10 +31,10 @@ namespace RAJA
 //@{
 //!   @name Methods to gather indices of segment or index set into a container.
 //!
-//!   For each method, the given container must be templated on a data type, 
-//!   have default and copy ctors, push_back method, and value_type. Is is 
-//!   assumed that the container data type and segment or index set data type 
-//!   are compatible in the sense that the index set type can be converted to 
+//!   For each method, the given container must be templated on a data type,
+//!   have default and copy ctors, push_back method, and value_type. It is
+//!   assumed that the container data type and segment or index set data type
+//!   are compatible in the sense that the index set type can be converted to
 //!   the container data type.
 
 /*!
@@ -49,11 +49,8 @@ RAJA_INLINE void getIndices(CONTAINER_T& con,
                             const TypedIndexSet<SEG_TYPES...>& iset)
 {
   CONTAINER_T tcon;
-  forall<ExecPolicy<seq_segit, seq_exec> >(iset,
-    [&](typename CONTAINER_T::value_type idx) {
-      tcon.push_back(idx);
-    }
-  );
+  forall<ExecPolicy<seq_segit, seq_exec>>(
+      iset, [&](typename CONTAINER_T::value_type idx) { tcon.push_back(idx); });
   con = tcon;
 }
 
@@ -68,11 +65,8 @@ template <typename CONTAINER_T, typename SEGMENT_T>
 RAJA_INLINE void getIndices(CONTAINER_T& con, const SEGMENT_T& seg)
 {
   CONTAINER_T tcon;
-  forall<seq_exec>(seg,
-    [&](typename CONTAINER_T::value_type idx) {
-      tcon.push_back(idx);
-    }
-  );
+  forall<seq_exec>(seg, [&](typename CONTAINER_T::value_type idx)
+                   { tcon.push_back(idx); });
   con = tcon;
 }
 
@@ -90,11 +84,12 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con,
                                        CONDITIONAL conditional)
 {
   CONTAINER_T tcon;
-  forall<ExecPolicy<seq_segit, seq_exec> >(iset,
-    [&](typename CONTAINER_T::value_type idx) {
-      if (conditional(idx)) tcon.push_back(idx);
-    }
-  );
+  forall<ExecPolicy<seq_segit, seq_exec>>(
+      iset,
+      [&](typename CONTAINER_T::value_type idx)
+      {
+        if (conditional(idx)) tcon.push_back(idx);
+      });
   con = tcon;
 }
 
@@ -113,10 +108,10 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con,
 {
   CONTAINER_T tcon;
   forall<seq_exec>(seg,
-    [&](typename CONTAINER_T::value_type idx) {
-      if (conditional(idx)) tcon.push_back(idx);
-    }
-  );
+                   [&](typename CONTAINER_T::value_type idx)
+                   {
+                     if (conditional(idx)) tcon.push_back(idx);
+                   });
   con = tcon;
 }
 
diff --git a/include/RAJA/index/IndexValue.hpp b/include/RAJA/index/IndexValue.hpp
index 44fa143445..7ed94a299e 100644
--- a/include/RAJA/index/IndexValue.hpp
+++ b/include/RAJA/index/IndexValue.hpp
@@ -28,8 +28,8 @@
 namespace RAJA
 {
 
-struct IndexValueBase {
-};
+struct IndexValueBase
+{};
 
 /*!
  * \brief Strongly typed "integer" class.
@@ -44,16 +44,17 @@ struct IndexValueBase {
  * Yes, this uses the curiously-recurring template pattern.
  */
 template <typename TYPE, typename VALUE = RAJA::Index_type>
-struct IndexValue : public IndexValueBase {
+struct IndexValue : public IndexValueBase
+{
 
   using value_type = VALUE;
 
   //! Default constructor initializes value to 0.
-  RAJA_INLINE constexpr IndexValue() = default;
-  constexpr RAJA_INLINE IndexValue(IndexValue const &) = default;
-  constexpr RAJA_INLINE IndexValue(IndexValue &&) = default;
-  RAJA_INLINE IndexValue &operator=(IndexValue const &) = default;
-  RAJA_INLINE IndexValue &operator=(IndexValue &&) = default;
+  RAJA_INLINE constexpr IndexValue()                   = default;
+  constexpr RAJA_INLINE IndexValue(IndexValue const&)  = default;
+  constexpr RAJA_INLINE IndexValue(IndexValue&&)       = default;
+  RAJA_INLINE IndexValue& operator=(IndexValue const&) = default;
+  RAJA_INLINE IndexValue& operator=(IndexValue&&)      = default;
 
   /*!
    * \brief Explicit constructor.
@@ -61,14 +62,13 @@ struct IndexValue : public IndexValueBase {
    */
   RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit IndexValue(value_type v)
       : value(v)
-  {
-  }
+  {}
 
   //! Dereference provides cast-to-integer.
-  RAJA_HOST_DEVICE RAJA_INLINE value_type &operator*() { return value; }
+  RAJA_HOST_DEVICE RAJA_INLINE value_type& operator*() { return value; }
 
   //! Dereference provides cast-to-integer.
-  RAJA_HOST_DEVICE RAJA_INLINE const value_type &operator*() const
+  RAJA_HOST_DEVICE RAJA_INLINE const value_type& operator*() const
   {
     return value;
   }
@@ -82,10 +82,10 @@ struct IndexValue : public IndexValueBase {
   }
 
   //! preincrement stored index
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator++()
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator++()
   {
     value++;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
   //! postdecrement -- returns a copy
@@ -97,10 +97,10 @@ struct IndexValue : public IndexValueBase {
   }
 
   //! preincrement stored index
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator--()
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator--()
   {
     value--;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
   //! addition to underlying index from an Index_type
@@ -163,52 +163,52 @@ struct IndexValue : public IndexValueBase {
     return TYPE(value % a.value);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator+=(value_type x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator+=(value_type x)
   {
     value += x;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator+=(TYPE x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator+=(TYPE x)
   {
     value += x.value;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator-=(value_type x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator-=(value_type x)
   {
     value -= x;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator-=(TYPE x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator-=(TYPE x)
   {
     value -= x.value;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator*=(value_type x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator*=(value_type x)
   {
     value *= x;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator*=(TYPE x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator*=(TYPE x)
   {
     value *= x.value;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator/=(value_type x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator/=(value_type x)
   {
     value /= x;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator/=(TYPE x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator/=(TYPE x)
   {
     value /= x.value;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
   RAJA_HOST_DEVICE RAJA_INLINE bool operator<(value_type x) const
@@ -334,18 +334,22 @@ constexpr RAJA_HOST_DEVICE RAJA_INLINE
   return val;
 }
 
-namespace internal{
-template<typename FROM, typename Enable = void>
-struct StripIndexTypeT {
-    using type = FROM;
+namespace internal
+{
+template <typename FROM, typename Enable = void>
+struct StripIndexTypeT
+{
+  using type = FROM;
 };
 
-template<typename FROM>
-struct StripIndexTypeT<FROM, typename std::enable_if<std::is_base_of<IndexValueBase, FROM>::value>::type>
+template <typename FROM>
+struct StripIndexTypeT<
+    FROM,
+    typename std::enable_if<std::is_base_of<IndexValueBase, FROM>::value>::type>
 {
-    using type = typename FROM::value_type;
+  using type = typename FROM::value_type;
 };
-} // namespace internal
+}  // namespace internal
 
 /*!
  * \brief Strips a strongly typed index to its underlying type
@@ -353,7 +357,7 @@ struct StripIndexTypeT<FROM, typename std::enable_if<std::is_base_of<IndexValueB
  *
  * \param FROM the original type
  */
-template<typename FROM>
+template <typename FROM>
 using strip_index_type_t = typename internal::StripIndexTypeT<FROM>::type;
 
 /*!
@@ -362,12 +366,11 @@ using strip_index_type_t = typename internal::StripIndexTypeT<FROM>::type;
  *
  * \param FROM the original type
  */
-template<typename FROM>
-using make_signed_t = typename std::conditional < 
-                                  std::is_floating_point<FROM>::value,
-                                    std::common_type<FROM>,
-                                    std::make_signed<FROM>
-                               >::type::type;
+template <typename FROM>
+using make_signed_t =
+    typename std::conditional<std::is_floating_point<FROM>::value,
+                              std::common_type<FROM>,
+                              std::make_signed<FROM>>::type::type;
 
 }  // namespace RAJA
 
@@ -376,19 +379,18 @@ using make_signed_t = typename std::conditional <
  * \param TYPE the name of the type
  * \param NAME a string literal to identify this index type
  */
-#define RAJA_INDEX_VALUE(TYPE, NAME)                                 \
-  class TYPE : public ::RAJA::IndexValue<TYPE>                       \
-  {                                                                  \
-    using parent = ::RAJA::IndexValue<TYPE>;                         \
-                                                                     \
-  public:                                                            \
-    using IndexValueType = TYPE;                                     \
-    RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {}    \
-    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(::RAJA::Index_type v) \
-        : parent::IndexValue(v)                                      \
-    {                                                                \
-    }                                                                \
-    static inline std::string getName() { return NAME; }             \
+#define RAJA_INDEX_VALUE(TYPE, NAME)                                           \
+  class TYPE : public ::RAJA::IndexValue<TYPE>                                 \
+  {                                                                            \
+    using parent = ::RAJA::IndexValue<TYPE>;                                   \
+                                                                               \
+  public:                                                                      \
+    using IndexValueType = TYPE;                                               \
+    RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {}              \
+    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(::RAJA::Index_type v)           \
+        : parent::IndexValue(v)                                                \
+    {}                                                                         \
+    static inline std::string getName() { return NAME; }                       \
   };
 
 /*!
@@ -397,17 +399,17 @@ using make_signed_t = typename std::conditional <
  * \param IDXT the index types value type
  * \param NAME a string literal to identify this index type
  */
-#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME)                         \
-  class TYPE : public ::RAJA::IndexValue<TYPE, IDXT>                 \
-  {                                                                  \
-  public:                                                            \
-    RAJA_HOST_DEVICE RAJA_INLINE TYPE()                              \
-        : RAJA::IndexValue<TYPE,IDXT>::IndexValue() {}               \
-    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v)               \
-        : RAJA::IndexValue<TYPE,IDXT>::IndexValue(v)                 \
-    {                                                                \
-    }                                                                \
-    static inline std::string getName() { return NAME; }             \
+#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME)                                   \
+  class TYPE : public ::RAJA::IndexValue<TYPE, IDXT>                           \
+  {                                                                            \
+  public:                                                                      \
+    RAJA_HOST_DEVICE RAJA_INLINE TYPE()                                        \
+        : RAJA::IndexValue<TYPE, IDXT>::IndexValue()                           \
+    {}                                                                         \
+    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v)                         \
+        : RAJA::IndexValue<TYPE, IDXT>::IndexValue(v)                          \
+    {}                                                                         \
+    static inline std::string getName() { return NAME; }                       \
   };
 
 #endif
diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp
index adee46053c..187ec05d3f 100644
--- a/include/RAJA/index/ListSegment.hpp
+++ b/include/RAJA/index/ListSegment.hpp
@@ -85,7 +85,6 @@ template <typename StorageT>
 class TypedListSegment
 {
 public:
-
   //@{
   //!   @name Types used in implementation based on template parameter.
 
@@ -111,7 +110,7 @@ class TypedListSegment
    * \param values array of indices defining iteration space of segment
    * \param length number of indices
    * \param resource camp resource defining memory space where index data live
-   * \param owned optional enum value indicating whether segment owns indices 
+   * \param owned optional enum value indicating whether segment owns indices
    * (Owned or Unowned). Default is Owned.
    *
    * If 'Unowned' is passed as last argument, the segment will not own its
@@ -121,7 +120,7 @@ class TypedListSegment
                    Index_type length,
                    camp::resources::Resource resource,
                    IndexOwnership owned = Owned)
-    : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0)
+      : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0)
   {
     initIndexData(values, length, resource, owned);
   }
@@ -141,30 +140,34 @@ class TypedListSegment
   template <typename Container>
   TypedListSegment(const Container& container,
                    camp::resources::Resource resource)
-    : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(container.size())
+      : m_resource(nullptr),
+        m_owned(Unowned),
+        m_data(nullptr),
+        m_size(container.size())
   {
-    if (m_size > 0) {
+    if (m_size > 0)
+    {
 
-      camp::resources::Resource host_res{camp::resources::Host()};
+      camp::resources::Resource host_res {camp::resources::Host()};
 
       value_type* tmp = host_res.allocate<value_type>(m_size);
 
-      auto dest = tmp;
-      auto src = container.begin();
+      auto dest      = tmp;
+      auto src       = container.begin();
       auto const end = container.end();
-      while (src != end) {
+      while (src != end)
+      {
         *dest = *src;
         ++dest;
         ++src;
       }
 
       m_resource = new camp::resources::Resource(resource);
-      m_data = m_resource->allocate<value_type>(m_size);
+      m_data     = m_resource->allocate<value_type>(m_size);
       m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size);
       m_owned = Owned;
 
       host_res.deallocate(tmp);
-
     }
   }
 
@@ -175,10 +178,11 @@ class TypedListSegment
   //  As this may be called from a lambda in a
   //  RAJA method we perform a shallow copy
   RAJA_HOST_DEVICE TypedListSegment(const TypedListSegment& other)
-    : m_resource(nullptr),
-      m_owned(Unowned), m_data(other.m_data), m_size(other.m_size)
-  {
-  }
+      : m_resource(nullptr),
+        m_owned(Unowned),
+        m_data(other.m_data),
+        m_size(other.m_size)
+  {}
 
   //! Copy assignment for list segment
   //  As this may be called from a lambda in a
@@ -187,59 +191,59 @@ class TypedListSegment
   {
     clear();
     m_resource = nullptr;
-    m_owned = Unowned;
-    m_data = other.m_data;
-    m_size = other.m_size;
+    m_owned    = Unowned;
+    m_data     = other.m_data;
+    m_size     = other.m_size;
   }
 
-    //! move assignment for list segment
+  //! Move assignment for list segment
   //  As this may be called from a lambda in a
   //  RAJA method we perform a shallow copy
   RAJA_HOST_DEVICE TypedListSegment& operator=(TypedListSegment&& rhs)
   {
     clear();
     m_resource = rhs.m_resource;
-    m_owned = rhs.m_owned;
-    m_data = rhs.m_data;
-    m_size = rhs.m_size;
+    m_owned    = rhs.m_owned;
+    m_data     = rhs.m_data;
+    m_size     = rhs.m_size;
 
     rhs.m_resource = nullptr;
-    rhs.m_owned = Unowned;
-    rhs.m_data = nullptr;
-    rhs.m_size = 0;
+    rhs.m_owned    = Unowned;
+    rhs.m_data     = nullptr;
+    rhs.m_size     = 0;
   }
 
   //! Move constructor for list segment
   RAJA_HOST_DEVICE TypedListSegment(TypedListSegment&& rhs)
-    : m_resource(rhs.m_resource),
-      m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size)
+      : m_resource(rhs.m_resource),
+        m_owned(rhs.m_owned),
+        m_data(rhs.m_data),
+        m_size(rhs.m_size)
   {
-    rhs.m_owned = Unowned;
+    rhs.m_owned    = Unowned;
     rhs.m_resource = nullptr;
-    rhs.m_size = 0;
-    rhs.m_data = nullptr;
+    rhs.m_size     = 0;
+    rhs.m_data     = nullptr;
   }
 
   //! List segment destructor
-  RAJA_HOST_DEVICE ~TypedListSegment()
-  {
-    clear();
-  }
+  RAJA_HOST_DEVICE ~TypedListSegment() { clear(); }
 
   //! Clear method to be called
   RAJA_HOST_DEVICE void clear()
   {
 
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_data != nullptr && m_owned == Owned) {
+    if (m_data != nullptr && m_owned == Owned)
+    {
       m_resource->deallocate(m_data);
       delete m_resource;
     }
 #endif
-    m_data = nullptr;
+    m_data     = nullptr;
     m_resource = nullptr;
-    m_owned = Unowned;
-    m_size = 0;
+    m_owned    = Unowned;
+    m_size     = 0;
   }
 
   //@}
@@ -345,32 +349,35 @@ class TypedListSegment
   {
 
     // empty list segment
-    if (len <= 0 || container == nullptr) {
-      m_data = nullptr;
-      m_size = 0;
+    if (len <= 0 || container == nullptr)
+    {
+      m_data  = nullptr;
+      m_size  = 0;
       m_owned = Unowned;
       return;
     }
 
     // some non-zero size -- initialize accordingly
-    m_size = len;
+    m_size  = len;
     m_owned = container_own;
-    if (m_owned == Owned) {
+    if (m_owned == Owned)
+    {
 
-        m_resource = new camp::resources::Resource(resource_);
+      m_resource = new camp::resources::Resource(resource_);
 
-        camp::resources::Resource host_res{camp::resources::Host()};
+      camp::resources::Resource host_res {camp::resources::Host()};
 
-        value_type* tmp = host_res.allocate<value_type>(m_size);
+      value_type* tmp = host_res.allocate<value_type>(m_size);
 
-        for (Index_type i = 0; i < m_size; ++i) {
-          tmp[i] = container[i];
-        }
+      for (Index_type i = 0; i < m_size; ++i)
+      {
+        tmp[i] = container[i];
+      }
 
-        m_data = m_resource->allocate<value_type>(m_size);
-        m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size);
+      m_data = m_resource->allocate<value_type>(m_size);
+      m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size);
 
-        host_res.deallocate(tmp);
+      host_res.deallocate(tmp);
 
       return;
     }
@@ -382,7 +389,7 @@ class TypedListSegment
 
 
   // Copy of camp resource passed to ctor
-  camp::resources::Resource *m_resource;
+  camp::resources::Resource* m_resource;
 
   // Ownership flag to guide data copying/management
   IndexOwnership m_owned;
diff --git a/include/RAJA/index/RangeSegment.hpp b/include/RAJA/index/RangeSegment.hpp
index a41959c583..57fdb4c55e 100644
--- a/include/RAJA/index/RangeSegment.hpp
+++ b/include/RAJA/index/RangeSegment.hpp
@@ -50,10 +50,10 @@ namespace RAJA
  *
  * NOTE: TypedRangeSegment::iterator is a RandomAccessIterator
  *
- * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of 
+ * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of
  *       indices [-5, 3).
  *
- * NOTE: Proper handling of indices strides requires that StorageT is a 
+ * NOTE: Proper handling of indices strides requires that StorageT is a
  *       signed type.
  *
  * Usage:
@@ -92,15 +92,19 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename StorageT, typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
-struct TypedRangeSegment {
+template <typename StorageT,
+          typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
+struct TypedRangeSegment
+{
 
-  // 
+  //
   // Static asserts to provide some useful error messages during compilation
   // for incorrect usage.
-  // 
-  static_assert(std::is_signed<DiffT>::value, "TypedRangeSegment DiffT requires signed type.");
-  static_assert(!std::is_floating_point<StorageT>::value, "TypedRangeSegment Type must be non floating point.");
+  //
+  static_assert(std::is_signed<DiffT>::value,
+                "TypedRangeSegment DiffT requires signed type.");
+  static_assert(!std::is_floating_point<StorageT>::value,
+                "TypedRangeSegment Type must be non floating point.");
 
   //@{
   //!   @name Types used in implementation based on template parameters.
@@ -117,20 +121,19 @@ struct TypedRangeSegment {
   //@}
 
   //@{
-  //!   @name Constructors, destructor, and copy assignment. 
+  //!   @name Constructors, destructor, and copy assignment.
 
   /*!
    * \brief Construct a range segment repreenting the interval [begin, end)
-   * 
+   *
    * \param begin start value (inclusive) for the range
    * \param end end value (exclusive) for the range
    */
   using StripStorageT = strip_index_type_t<StorageT>;
-  RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, StripStorageT end)
-      : m_begin(iterator(begin)), 
-        m_end(begin > end ? m_begin : iterator(end))
-  {
-  }
+  RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin,
+                                               StripStorageT end)
+      : m_begin(iterator(begin)), m_end(begin > end ? m_begin : iterator(end))
+  {}
 
   //! Disable compiler generated constructor
   RAJA_HOST_DEVICE TypedRangeSegment() = delete;
@@ -187,7 +190,7 @@ struct TypedRangeSegment {
    * \brief Compare this segment to another for inequality
    *
    * \return true if begin or end does not match, else false
-   */ 
+   */
   RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeSegment const& o) const
   {
     return !(operator==(o));
@@ -198,9 +201,9 @@ struct TypedRangeSegment {
   /*!
    * \brief Get a new TypedRangeSegment instance representing a slice of
    *        existing segment
-   * 
-   * \param begin start iterate of new range 
-   * \param length maximum length of new range 
+   *
+   * \param begin start iterate of new range
+   * \param length maximum length of new range
    * \return TypedRangeSegment representing the interval
    *         [ *begin() + begin, min( *begin() + begin + length, *end() ) )
    *
@@ -213,7 +216,7 @@ struct TypedRangeSegment {
    *     auto r = RAJA::TypedRangeSegment<int>(-4, 4);
    *
    *     // s repreents the subinterval  [-3, 2)
-   *     auto s = r.slice(1, 5); 
+   *     auto s = r.slice(1, 5);
    *
    *   \endverbatim
    */
@@ -221,9 +224,9 @@ struct TypedRangeSegment {
                                                        DiffT length) const
   {
     StorageT start = m_begin[0] + begin;
-    StorageT end = start + length > m_end[0] ? m_end[0] : start + length;
+    StorageT end   = start + length > m_end[0] ? m_end[0] : start + length;
 
-    return TypedRangeSegment{stripIndexType(start), stripIndexType(end)};
+    return TypedRangeSegment {stripIndexType(start), stripIndexType(end)};
   }
 
   /*!
@@ -247,8 +250,8 @@ struct TypedRangeSegment {
 /*!
  ******************************************************************************
  *
- * \class TypedRangeStrideSegment 
- * 
+ * \class TypedRangeStrideSegment
+ *
  * \brief  Segment class representing a strided range of typed indices
  *
  * \tparam StorageT underlying data type for the segment indices (required)
@@ -264,9 +267,9 @@ struct TypedRangeSegment {
  *
  * NOTE: TypedRangeStrideSegment::iterator is a RandomAccessIterator
  *
- * NOTE: TypedRangeStrideSegment allows for positive or negative strides and 
- *       indices. This allows for forward (stride > 0) or backward (stride < 0) 
- *       traversal of the iteration space. A stride of zero is undefined and 
+ * NOTE: TypedRangeStrideSegment allows for positive or negative strides and
+ *       indices. This allows for forward (stride > 0) or backward (stride < 0)
+ *       traversal of the iteration space. A stride of zero is undefined and
  *       will cause divide-by-zero errors.
  *
  * As with RangeSegment, the iteration space is inclusive of begin() and
@@ -275,7 +278,7 @@ struct TypedRangeSegment {
  * For positive strides, begin() > end() implies size()==0
  * For negative strides, begin() < end() implies size()==0
  *
- * NOTE: Proper handling of negative strides and indices requires that 
+ * NOTE: Proper handling of negative strides and indices requires that
  *       StorageT is a signed type.
  *
  * Usage:
@@ -321,15 +324,19 @@ struct TypedRangeSegment {
  *
  ******************************************************************************
  */
-template <typename StorageT, typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
-struct TypedRangeStrideSegment {
+template <typename StorageT,
+          typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
+struct TypedRangeStrideSegment
+{
 
   //
   // Static asserts to provide some useful error messages during compilation
   // for incorrect usage.
   //
-  static_assert(std::is_signed<DiffT>::value, "TypedRangeStrideSegment DiffT requires signed type.");
-  static_assert(!std::is_floating_point<StorageT>::value, "TypedRangeStrideSegment Type must be non floating point.");
+  static_assert(std::is_signed<DiffT>::value,
+                "TypedRangeStrideSegment DiffT requires signed type.");
+  static_assert(!std::is_floating_point<StorageT>::value,
+                "TypedRangeStrideSegment Type must be non floating point.");
 
   //@{
   //!   @name Types used in implementation based on template parameters.
@@ -349,7 +356,7 @@ struct TypedRangeStrideSegment {
   //!   @name Constructors, destructor, and copy assignment.
 
   /*!
-   * \brief Construct a range segment for the interval [begin, end) with 
+   * \brief Construct a range segment for the interval [begin, end) with
    *        given stride
    *
    * \param begin start value (inclusive) for the range
@@ -357,9 +364,8 @@ struct TypedRangeStrideSegment {
    * \param stride stride value when iterating over the range
    */
   using StripStorageT = strip_index_type_t<StorageT>;
-  RAJA_HOST_DEVICE TypedRangeStrideSegment(StripStorageT begin,
-                                           StripStorageT end,
-                                           DiffT stride)
+  RAJA_HOST_DEVICE
+  TypedRangeStrideSegment(StripStorageT begin, StripStorageT end, DiffT stride)
       : m_begin(iterator(begin, stride)),
         m_end(iterator(end, stride)),
         // essentially a ceil((end-begin)/stride) but using integer math,
@@ -367,13 +373,16 @@ struct TypedRangeStrideSegment {
         m_size((end - begin + stride - (stride > 0 ? 1 : -1)) / stride)
   {
     // clamp range when end is unreachable from begin without wrapping
-    if (stride < 0 && end > begin) {
+    if (stride < 0 && end > begin)
+    {
       m_end = m_begin;
-    } else if (stride > 0 && end < begin) {
+    }
+    else if (stride > 0 && end < begin)
+    {
       m_end = m_begin;
     }
     // m_size initialized as negative indicates a zero iteration space
-    m_size = m_size < DiffT{0} ? DiffT{0} : m_size;
+    m_size = m_size < DiffT {0} ? DiffT {0} : m_size;
   }
 
   //! Disable compiler generated constructor
@@ -408,8 +417,8 @@ struct TypedRangeStrideSegment {
 
   /*!
    * \brief Get size of this segment
-   * 
-   * The size is the number of iterates in the 
+   *
+   * The size is the number of iterates in the
    * interval [begin, end) when striding over it
    */
   RAJA_HOST_DEVICE DiffT size() const { return m_size; }
@@ -435,7 +444,8 @@ struct TypedRangeStrideSegment {
    *
    * \return true if begin, end, or size does not match, else false
    */
-  RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeStrideSegment const& o) const
+  RAJA_HOST_DEVICE RAJA_INLINE bool
+  operator!=(TypedRangeStrideSegment const& o) const
   {
     return !(operator==(o));
   }
@@ -450,7 +460,7 @@ struct TypedRangeStrideSegment {
    * \param length maximum length of new range
    *
    * \return TypedRangeStrideSegment representing the interval
-   *         [ *begin() + begin * stride, 
+   *         [ *begin() + begin * stride,
    *           min( *begin() + (begin + length) * stride, *end() )
    *
    * Here's an example of a slice operation on a range segment with a negative
@@ -466,24 +476,26 @@ struct TypedRangeStrideSegment {
    *     //       5 indices in r starting at the 6th entry
    *     auto s = r.slice(6, 6);
    *
-   *   \endverbatim 
+   *   \endverbatim
    */
   RAJA_HOST_DEVICE TypedRangeStrideSegment slice(StorageT begin,
                                                  DiffT length) const
   {
     StorageT stride = m_begin.get_stride();
-    StorageT start = m_begin[0] + begin * stride;
-    StorageT end = start + stride * length;
+    StorageT start  = m_begin[0] + begin * stride;
+    StorageT end    = start + stride * length;
 
-    if (stride > 0) {
+    if (stride > 0)
+    {
       end = end > m_end[0] ? m_end[0] : end;
-    } else {
+    }
+    else
+    {
       end = end < m_end[0] ? m_end[0] : end;
     }
 
-    return TypedRangeStrideSegment{stripIndexType(start),
-                                   stripIndexType(end),
-                                   m_begin.get_stride()};
+    return TypedRangeStrideSegment {stripIndexType(start), stripIndexType(end),
+                                    m_begin.get_stride()};
   }
 
   /*!
@@ -518,11 +530,12 @@ namespace detail
 
 template <typename T, typename... Rest>
 struct common_type
-    : std::common_type<T, typename std::common_type<Rest...>::type> {
-};
+    : std::common_type<T, typename std::common_type<Rest...>::type>
+{};
 
 template <typename T>
-struct common_type<T> {
+struct common_type<T>
+{
   using type = T;
 };
 
@@ -549,7 +562,7 @@ RAJA_HOST_DEVICE TypedRangeSegment<Common> make_range(BeginT&& begin,
 }
 
 /*!
- * \brief Function to make a TypedRangeStride Segment for the interval 
+ * \brief Function to make a TypedRangeStride Segment for the interval
  *        [begin, end) with given stride
  *
  *  \return a newly constructed TypedRangeStrideSegment where
@@ -561,13 +574,14 @@ template <typename BeginT,
           typename EndT,
           typename StrideT,
           typename Common = detail::common_type_t<BeginT, EndT>>
-RAJA_HOST_DEVICE TypedRangeStrideSegment<Common> make_strided_range(
-    BeginT&& begin,
-    EndT&& end,
-    StrideT&& stride)
+RAJA_HOST_DEVICE TypedRangeStrideSegment<Common>
+make_strided_range(BeginT&& begin, EndT&& end, StrideT&& stride)
 {
-  static_assert(std::is_signed<StrideT>::value, "make_strided_segment : stride must be signed.");
-  static_assert(std::is_same<make_signed_t<EndT>, StrideT>::value, "make_stride_segment : stride and end must be of similar types.");
+  static_assert(std::is_signed<StrideT>::value,
+                "make_strided_segment : stride must be signed.");
+  static_assert(
+      std::is_same<make_signed_t<EndT>, StrideT>::value,
+      "make_stride_segment : stride and end must be of similar types.");
   return {begin, end, stride};
 }
 
@@ -576,13 +590,13 @@ namespace concepts
 
 template <typename T, typename U>
 struct RangeConstructible
-    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U>>()) {
-};
+    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U>>())
+{};
 
 template <typename T, typename U, typename V>
 struct RangeStrideConstructible
-    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U, V>>()) {
-};
+    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U, V>>())
+{};
 
 }  // namespace concepts
 
diff --git a/include/RAJA/internal/DepGraphNode.hpp b/include/RAJA/internal/DepGraphNode.hpp
index 8feceae22f..d2a30ee5ce 100644
--- a/include/RAJA/internal/DepGraphNode.hpp
+++ b/include/RAJA/internal/DepGraphNode.hpp
@@ -57,8 +57,7 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode
   ///
   DepGraphNode()
       : m_num_dep_tasks(0), m_semaphore_reload_value(0), m_semaphore_value(0)
-  {
-  }
+  {}
 
   ///
   /// Get/set semaphore value; i.e., the current number of (unsatisfied)
@@ -82,7 +81,8 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode
   ///
   void satisfyOne()
   {
-    if (m_semaphore_value > 0) {
+    if (m_semaphore_value > 0)
+    {
       --m_semaphore_value;
     }
   }
@@ -92,7 +92,8 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode
   ///
   void wait()
   {
-    while (m_semaphore_value > 0) {
+    while (m_semaphore_value > 0)
+    {
       // TODO: an efficient wait would be better here, but the standard
       // promise/future is not good enough
       std::this_thread::yield();
diff --git a/include/RAJA/internal/Iterators.hpp b/include/RAJA/internal/Iterators.hpp
index 6f32a56e6d..33cdd3f539 100644
--- a/include/RAJA/internal/Iterators.hpp
+++ b/include/RAJA/internal/Iterators.hpp
@@ -50,7 +50,8 @@ std::string overflow_msg(LType lhs, RType rhs)
 template <typename Type, typename DifferenceType>
 RAJA_HOST_DEVICE bool is_addition_overflow(Type lhs, DifferenceType rhs)
 {
-  if (std::is_unsigned<Type>::value) {
+  if (std::is_unsigned<Type>::value)
+  {
     if ((rhs > 0) && (lhs > std::numeric_limits<Type>::max() - rhs))
       return true;
     if ((rhs < 0) && (lhs < std::numeric_limits<Type>::min() - rhs))
@@ -64,18 +65,22 @@ RAJA_HOST_DEVICE bool is_subtraction_overflow(Type lhs,
                                               DifferenceType rhs,
                                               bool iterator_on_left = true)
 {
-  if (iterator_on_left) {
+  if (iterator_on_left)
+  {
 
-    if (std::is_unsigned<Type>::value) {
+    if (std::is_unsigned<Type>::value)
+    {
       if ((rhs > 0) && (lhs < std::numeric_limits<Type>::min() + rhs))
         return true;
       if ((rhs < 0) && (lhs > std::numeric_limits<Type>::max() + rhs))
         return true;
     }
+  }
+  else
+  {  // Special case where operation is : value(lhs) - iterator(rhs).
 
-  } else {  // Special case where operation is : value(lhs) - iterator(rhs).
-
-    if (std::is_unsigned<DifferenceType>::value) {
+    if (std::is_unsigned<DifferenceType>::value)
+    {
       if ((lhs > 0) && (rhs < std::numeric_limits<DifferenceType>::min() + lhs))
         return true;
       if ((lhs < 0)) return true;
@@ -100,29 +105,28 @@ RAJA_HOST_DEVICE void check_is_subtraction_overflow(Type lhs,
 }
 #endif
 
-template <typename Type = Index_type,
+template <typename Type           = Index_type,
           typename DifferenceType = Type,
-          typename PointerType = Type*>
+          typename PointerType    = Type*>
 class numeric_iterator
 {
 public:
-  using value_type = Type;
+  using value_type          = Type;
   using stripped_value_type = strip_index_type_t<Type>;
-  using difference_type = DifferenceType;
-  using pointer = PointerType;
-  using reference = value_type&;
-  using iterator_category = std::random_access_iterator_tag;
-
-  constexpr numeric_iterator() noexcept = default;
-  constexpr numeric_iterator(const numeric_iterator&) noexcept = default;
-  constexpr numeric_iterator(numeric_iterator&&) noexcept = default;
+  using difference_type     = DifferenceType;
+  using pointer             = PointerType;
+  using reference           = value_type&;
+  using iterator_category   = std::random_access_iterator_tag;
+
+  constexpr numeric_iterator() noexcept                         = default;
+  constexpr numeric_iterator(const numeric_iterator&) noexcept  = default;
+  constexpr numeric_iterator(numeric_iterator&&) noexcept       = default;
   numeric_iterator& operator=(const numeric_iterator&) noexcept = default;
-  numeric_iterator& operator=(numeric_iterator&&) noexcept = default;
+  numeric_iterator& operator=(numeric_iterator&&) noexcept      = default;
 
   RAJA_HOST_DEVICE constexpr numeric_iterator(const stripped_value_type& rhs)
       : val(rhs)
-  {
-  }
+  {}
 
   RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return 1; }
 
@@ -174,8 +178,8 @@ class numeric_iterator
     return tmp;
   }
 
-  RAJA_HOST_DEVICE inline numeric_iterator& operator+=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline numeric_iterator&
+  operator+=(const difference_type& rhs)
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     check_is_addition_overflow(val, rhs);
@@ -183,8 +187,8 @@ class numeric_iterator
     val += rhs;
     return *this;
   }
-  RAJA_HOST_DEVICE inline numeric_iterator& operator-=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline numeric_iterator&
+  operator-=(const difference_type& rhs)
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     check_is_subtraction_overflow(val, rhs);
@@ -192,48 +196,47 @@ class numeric_iterator
     val -= rhs;
     return *this;
   }
-  RAJA_HOST_DEVICE inline numeric_iterator& operator+=(
-      const numeric_iterator& rhs)
+  RAJA_HOST_DEVICE inline numeric_iterator&
+  operator+=(const numeric_iterator& rhs)
   {
     val += rhs.val;
     return *this;
   }
-  RAJA_HOST_DEVICE inline numeric_iterator& operator-=(
-      const numeric_iterator& rhs)
+  RAJA_HOST_DEVICE inline numeric_iterator&
+  operator-=(const numeric_iterator& rhs)
   {
     val -= rhs.val;
     return *this;
   }
 
-  RAJA_HOST_DEVICE inline stripped_value_type operator+(
-      const numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline stripped_value_type
+  operator+(const numeric_iterator& rhs) const
   {
     return val + rhs.val;
   }
-  RAJA_HOST_DEVICE inline stripped_value_type operator-(
-      const numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline stripped_value_type
+  operator-(const numeric_iterator& rhs) const
   {
     return val - rhs.val;
   }
-  RAJA_HOST_DEVICE inline numeric_iterator operator+(
-      const difference_type& rhs) const
+  RAJA_HOST_DEVICE inline numeric_iterator
+  operator+(const difference_type& rhs) const
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     check_is_addition_overflow(val, rhs);
 #endif
     return numeric_iterator(val + rhs);
   }
-  RAJA_HOST_DEVICE inline numeric_iterator operator-(
-      const difference_type& rhs) const
+  RAJA_HOST_DEVICE inline numeric_iterator
+  operator-(const difference_type& rhs) const
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     check_is_subtraction_overflow(val, rhs);
 #endif
     return numeric_iterator(val - rhs);
   }
-  RAJA_HOST_DEVICE friend constexpr numeric_iterator operator+(
-      difference_type lhs,
-      const numeric_iterator& rhs)
+  RAJA_HOST_DEVICE friend constexpr numeric_iterator
+  operator+(difference_type lhs, const numeric_iterator& rhs)
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     return is_addition_overflow(rhs.val, lhs)
@@ -243,9 +246,8 @@ class numeric_iterator
     return numeric_iterator(lhs + rhs.val);
 #endif
   }
-  RAJA_HOST_DEVICE friend constexpr numeric_iterator operator-(
-      difference_type lhs,
-      const numeric_iterator& rhs)
+  RAJA_HOST_DEVICE friend constexpr numeric_iterator
+  operator-(difference_type lhs, const numeric_iterator& rhs)
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     return is_subtraction_overflow(rhs.val, lhs, false)
@@ -273,31 +275,34 @@ class numeric_iterator
   stripped_value_type val = 0;
 };
 
-template <typename Type = Index_type,
+template <typename Type           = Index_type,
           typename DifferenceType = Type,
-          typename PointerType = Type*>
+          typename PointerType    = Type*>
 class strided_numeric_iterator
 {
 public:
-  using value_type = Type;
+  using value_type          = Type;
   using stripped_value_type = strip_index_type_t<Type>;
-  using difference_type = DifferenceType;
-  using pointer = DifferenceType*;
-  using reference = DifferenceType&;
-  using iterator_category = std::random_access_iterator_tag;
+  using difference_type     = DifferenceType;
+  using pointer             = DifferenceType*;
+  using reference           = DifferenceType&;
+  using iterator_category   = std::random_access_iterator_tag;
 
   constexpr strided_numeric_iterator() noexcept = default;
-  constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept = default;
-  constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept = default;
-  strided_numeric_iterator& operator=(const strided_numeric_iterator&) noexcept = default;
-  strided_numeric_iterator& operator=(strided_numeric_iterator&&) noexcept = default;
+  constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept =
+      default;
+  constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept =
+      default;
+  strided_numeric_iterator&
+  operator=(const strided_numeric_iterator&) noexcept = default;
+  strided_numeric_iterator&
+  operator=(strided_numeric_iterator&&) noexcept = default;
 
   RAJA_HOST_DEVICE constexpr strided_numeric_iterator(
       stripped_value_type rhs,
       DifferenceType stride_ = DifferenceType(1))
       : val(rhs), stride(stride_)
-  {
-  }
+  {}
 
   RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return stride; }
 
@@ -312,8 +317,8 @@ class strided_numeric_iterator
     return *this;
   }
 
-  RAJA_HOST_DEVICE inline strided_numeric_iterator& operator+=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline strided_numeric_iterator&
+  operator+=(const difference_type& rhs)
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     check_is_addition_overflow(val, rhs * stride);
@@ -321,8 +326,8 @@ class strided_numeric_iterator
     val += rhs * stride;
     return *this;
   }
-  RAJA_HOST_DEVICE inline strided_numeric_iterator& operator-=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline strided_numeric_iterator&
+  operator-=(const difference_type& rhs)
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     check_is_subtraction_overflow(val, rhs * stride);
@@ -331,33 +336,33 @@ class strided_numeric_iterator
     return *this;
   }
 
-  RAJA_HOST_DEVICE inline difference_type operator+(
-      const strided_numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline difference_type
+  operator+(const strided_numeric_iterator& rhs) const
   {
     return (static_cast<difference_type>(val) +
             (static_cast<difference_type>(rhs.val))) /
            stride;
   }
-  RAJA_HOST_DEVICE inline difference_type operator-(
-      const strided_numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline difference_type
+  operator-(const strided_numeric_iterator& rhs) const
   {
     difference_type diff = (static_cast<difference_type>(val) -
                             (static_cast<difference_type>(rhs.val)));
 
-    return (diff % stride != difference_type{0})
-               ? (difference_type{1} + diff / stride)
+    return (diff % stride != difference_type {0})
+               ? (difference_type {1} + diff / stride)
                : diff / stride;
   }
-  RAJA_HOST_DEVICE inline strided_numeric_iterator operator+(
-      const difference_type& rhs) const
+  RAJA_HOST_DEVICE inline strided_numeric_iterator
+  operator+(const difference_type& rhs) const
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     check_is_addition_overflow(val, rhs * stride);
 #endif
     return strided_numeric_iterator(val + rhs * stride, stride);
   }
-  RAJA_HOST_DEVICE inline strided_numeric_iterator operator-(
-      const difference_type& rhs) const
+  RAJA_HOST_DEVICE inline strided_numeric_iterator
+  operator-(const difference_type& rhs) const
   {
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
     check_is_subtraction_overflow(val, rhs * stride);
@@ -367,34 +372,34 @@ class strided_numeric_iterator
 
   // Specialized comparison to allow normal iteration to work on off-stride
   // multiples by adjusting rhs to the nearest *higher* multiple of stride
-  RAJA_HOST_DEVICE inline bool operator!=(
-      const strided_numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline bool
+  operator!=(const strided_numeric_iterator& rhs) const
   {
     return (val - rhs.val) / stride;
   }
-  RAJA_HOST_DEVICE inline bool operator==(
-      const strided_numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline bool
+  operator==(const strided_numeric_iterator& rhs) const
   {
     return !((val - rhs.val) / stride);
   }
 
-  RAJA_HOST_DEVICE inline bool operator>(
-      const strided_numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline bool
+  operator>(const strided_numeric_iterator& rhs) const
   {
     return val * stride > rhs.val * stride;
   }
-  RAJA_HOST_DEVICE inline bool operator<(
-      const strided_numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline bool
+  operator<(const strided_numeric_iterator& rhs) const
   {
     return val * stride < rhs.val * stride;
   }
-  RAJA_HOST_DEVICE inline bool operator>=(
-      const strided_numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline bool
+  operator>=(const strided_numeric_iterator& rhs) const
   {
     return val * stride >= rhs.val * stride;
   }
-  RAJA_HOST_DEVICE inline bool operator<=(
-      const strided_numeric_iterator& rhs) const
+  RAJA_HOST_DEVICE inline bool
+  operator<=(const strided_numeric_iterator& rhs) const
   {
     return val * stride <= rhs.val * stride;
   }
@@ -415,7 +420,7 @@ class strided_numeric_iterator
 
 private:
   stripped_value_type val = 0;
-  DifferenceType stride = 1;
+  DifferenceType stride   = 1;
 };
 
 
diff --git a/include/RAJA/internal/MemUtils_CPU.hpp b/include/RAJA/internal/MemUtils_CPU.hpp
index 55015f9ab7..a7dee5a77c 100644
--- a/include/RAJA/internal/MemUtils_CPU.hpp
+++ b/include/RAJA/internal/MemUtils_CPU.hpp
@@ -27,7 +27,7 @@
 
 #include "RAJA/util/types.hpp"
 
-#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \
+#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) ||                \
     defined(__MINGW32__) || defined(__BORLANDC__)
 #define RAJA_PLATFORM_WINDOWS
 #include <malloc.h>
@@ -44,7 +44,7 @@ inline void* allocate_aligned(size_t alignment, size_t size)
 #if defined(RAJA_HAVE_POSIX_MEMALIGN)
   // posix_memalign available
   void* ret = nullptr;
-  int err = posix_memalign(&ret, alignment, size);
+  int err   = posix_memalign(&ret, alignment, size);
   return err ? nullptr : ret;
 #elif defined(RAJA_HAVE_ALIGNED_ALLOC)
   return std::aligned_alloc(alignment, size);
@@ -53,10 +53,10 @@ inline void* allocate_aligned(size_t alignment, size_t size)
 #elif defined(RAJA_PLATFORM_WINDOWS)
   return _aligned_malloc(size, alignment);
 #else
-  char *mem = (char *)malloc(size + alignment + sizeof(void *));
+  char* mem = (char*)malloc(size + alignment + sizeof(void*));
   if (nullptr == mem) return nullptr;
-  void **ptr = (void **)((std::uintptr_t)(mem + alignment + sizeof(void *)) &
-                         ~(alignment - 1));
+  void** ptr = (void**)((std::uintptr_t)(mem + alignment + sizeof(void*)) &
+                        ~(alignment - 1));
   // Store the original address one position behind what we give the user.
   ptr[-1] = mem;
   return ptr;
@@ -97,25 +97,23 @@ inline void free_aligned(void* ptr)
 ///
 struct FreeAligned
 {
-  void operator()(void* ptr)
-  {
-    free_aligned(ptr);
-  }
+  void operator()(void* ptr) { free_aligned(ptr); }
 };
 
 ///
 /// Deleter function object for memory allocated with allocate_aligned_type
 /// that calls the destructor for the fist size objects in the storage.
 ///
-template < typename T, typename index_type >
+template <typename T, typename index_type>
 struct FreeAlignedType : FreeAligned
 {
   index_type size = 0;
 
   void operator()(T* ptr)
   {
-    for ( index_type i = size; i > 0; --i ) {
-      ptr[i-1].~T();
+    for (index_type i = size; i > 0; --i)
+    {
+      ptr[i - 1].~T();
     }
     FreeAligned::operator()(ptr);
   }
diff --git a/include/RAJA/internal/RAJAVec.hpp b/include/RAJA/internal/RAJAVec.hpp
index 1d0ec0cbeb..7802bda6cd 100644
--- a/include/RAJA/internal/RAJAVec.hpp
+++ b/include/RAJA/internal/RAJAVec.hpp
@@ -49,7 +49,7 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename T, typename Allocator = std::allocator<T> >
+template <typename T, typename Allocator = std::allocator<T>>
 class RAJAVec
 {
   using allocator_traits_type = std::allocator_traits<Allocator>;
@@ -57,24 +57,25 @@ class RAJAVec
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
+
 public:
-  using value_type = T;
-  using allocator_type = Allocator;
-  using size_type = std::size_t;
+  using value_type      = T;
+  using allocator_type  = Allocator;
+  using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
-  using reference = value_type&;
+  using reference       = value_type&;
   using const_reference = const value_type&;
-  using pointer = typename allocator_traits_type::pointer;
-  using const_pointer = typename allocator_traits_type::const_pointer;
-  using iterator = value_type*;
-  using const_iterator = const value_type*;
+  using pointer         = typename allocator_traits_type::pointer;
+  using const_pointer   = typename allocator_traits_type::const_pointer;
+  using iterator        = value_type*;
+  using const_iterator  = const value_type*;
 
   ///
   /// Construct empty vector with given capacity.
   ///
-  explicit RAJAVec(size_type init_cap = 0,
+  explicit RAJAVec(size_type init_cap      = 0,
                    const allocator_type& a = allocator_type())
       : m_data(nullptr), m_allocator(a), m_capacity(0), m_size(0)
   {
@@ -86,7 +87,9 @@ class RAJAVec
   ///
   RAJAVec(const RAJAVec& other)
       : m_data(nullptr),
-        m_allocator(allocator_traits_type::select_on_container_copy_construction(other.m_allocator)),
+        m_allocator(
+            allocator_traits_type::select_on_container_copy_construction(
+                other.m_allocator)),
         m_capacity(0),
         m_size(0)
   {
@@ -103,9 +106,9 @@ class RAJAVec
         m_capacity(other.m_capacity),
         m_size(other.m_size)
   {
-    other.m_data = nullptr;
+    other.m_data     = nullptr;
     other.m_capacity = 0;
-    other.m_size = 0;
+    other.m_size     = 0;
   }
 
   ///
@@ -113,8 +116,9 @@ class RAJAVec
   ///
   RAJAVec& operator=(const RAJAVec& rhs)
   {
-    if (&rhs != this) {
-      copy_assign_private(rhs, propagate_on_container_copy_assignment{});
+    if (&rhs != this)
+    {
+      copy_assign_private(rhs, propagate_on_container_copy_assignment {});
     }
     return *this;
   }
@@ -124,8 +128,10 @@ class RAJAVec
   ///
   RAJAVec& operator=(RAJAVec&& rhs)
   {
-    if (&rhs != this) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+    if (&rhs != this)
+    {
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment {});
     }
     return *this;
   }
@@ -144,31 +150,31 @@ class RAJAVec
   ///
   void swap(RAJAVec& other)
   {
-    swap_private(other, propagate_on_container_swap{});
+    swap_private(other, propagate_on_container_swap {});
   }
 
   ///
   /// Get a pointer to the beginning of the contiguous vector
   ///
-        pointer data()       { return m_data; }
+  pointer data() { return m_data; }
   ///
   const_pointer data() const { return m_data; }
 
   ///
   /// Get an iterator to the end.
   ///
-        iterator  end()       { return m_data + m_size; }
+  iterator end() { return m_data + m_size; }
   ///
-  const_iterator  end() const { return m_data + m_size; }
+  const_iterator end() const { return m_data + m_size; }
   ///
   const_iterator cend() const { return m_data + m_size; }
 
   ///
   /// Get an iterator to the beginning.
   ///
-        iterator  begin()       { return m_data; }
+  iterator begin() { return m_data; }
   ///
-  const_iterator  begin() const { return m_data; }
+  const_iterator begin() const { return m_data; }
   ///
   const_iterator cbegin() const { return m_data; }
 
@@ -200,18 +206,12 @@ class RAJAVec
   ///
   /// Shrink the capacity of the vector to the current size.
   ///
-  void shrink_to_fit()
-  {
-    shrink_cap(m_size);
-  }
+  void shrink_to_fit() { shrink_cap(m_size); }
 
   ///
   /// Empty vector of all data.
   ///
-  void clear()
-  {
-    destroy_items_after(0);
-  }
+  void clear() { destroy_items_after(0); }
 
   ///
   /// Change the size of the vector,
@@ -221,10 +221,13 @@ class RAJAVec
   RAJA_INLINE
   void resize(size_type new_size)
   {
-    if (new_size >= size()) {
+    if (new_size >= size())
+    {
       reserve(new_size);
       construct_items_back(new_size);
-    } else {
+    }
+    else
+    {
       destroy_items_after(new_size);
     }
   }
@@ -237,10 +240,13 @@ class RAJAVec
   RAJA_INLINE
   void resize(size_type new_size, const_reference new_value)
   {
-    if (new_size >= size()) {
+    if (new_size >= size())
+    {
       reserve(new_size);
       construct_items_back(new_size, new_value);
-    } else {
+    }
+    else
+    {
       destroy_items_after(new_size);
     }
   }
@@ -248,23 +254,23 @@ class RAJAVec
   ///
   /// Bracket operator accessor.
   ///
-        reference operator[](difference_type i)       { return m_data[i]; }
+  reference operator[](difference_type i) { return m_data[i]; }
   ///
   const_reference operator[](difference_type i) const { return m_data[i]; }
 
   ///
   /// Access the last item of the vector.
   ///
-        reference front()       { return m_data[0]; }
+  reference front() { return m_data[0]; }
   ///
   const_reference front() const { return m_data[0]; }
 
   ///
   /// Access the last item of the vector.
   ///
-        reference back()       { return m_data[m_size-1]; }
+  reference back() { return m_data[m_size - 1]; }
   ///
-  const_reference back() const { return m_data[m_size-1]; }
+  const_reference back() const { return m_data[m_size - 1]; }
 
   ///
   /// Add item to front end of vector. Note that this operation is unique to
@@ -272,28 +278,31 @@ class RAJAVec
   ///
   void push_front(const_reference item) { emplace_front_private(item); }
   ///
-  void push_front(   value_type&& item) { emplace_front_private(std::move(item)); }
+  void push_front(value_type&& item) { emplace_front_private(std::move(item)); }
   ///
-  template < typename ... Os >
-  void emplace_front(Os&&... os) { emplace_front_private(std::forward<Os>(os)...); }
+  template <typename... Os>
+  void emplace_front(Os&&... os)
+  {
+    emplace_front_private(std::forward<Os>(os)...);
+  }
 
   ///
   /// Add item to back end of vector.
   ///
   void push_back(const_reference item) { emplace_back_private(item); }
   ///
-  void push_back(   value_type&& item) { emplace_back_private(std::move(item)); }
+  void push_back(value_type&& item) { emplace_back_private(std::move(item)); }
   ///
-  template < typename ... Os >
-  void emplace_back(Os&&... os) { emplace_back_private(std::forward<Os>(os)...); }
+  template <typename... Os>
+  void emplace_back(Os&&... os)
+  {
+    emplace_back_private(std::forward<Os>(os)...);
+  }
 
   ///
   /// Remove the last item of the vector.
   ///
-  void pop_back()
-  {
-    destroy_items_after(m_size-1);
-  }
+  void pop_back() { destroy_items_after(m_size - 1); }
 
 private:
   pointer m_data;
@@ -307,13 +316,14 @@ class RAJAVec
   ///
   void copy_assign_private(RAJAVec const& rhs, std::true_type)
   {
-    if (m_allocator != rhs.m_allocator) {
+    if (m_allocator != rhs.m_allocator)
+    {
       clear();
       shrink_to_fit();
       m_allocator = rhs.m_allocator;
     }
 
-    copy_assign_private(rhs, std::false_type{});
+    copy_assign_private(rhs, std::false_type {});
   }
 
   ///
@@ -323,10 +333,13 @@ class RAJAVec
   void copy_assign_private(RAJAVec const& rhs, std::false_type)
   {
     reserve(rhs.size());
-    if (size() < rhs.size()) {
+    if (size() < rhs.size())
+    {
       copy_assign_items(0, size(), rhs.data());
       copy_construct_items_back(rhs.size(), rhs.data());
-    } else {
+    }
+    else
+    {
       copy_assign_items(0, rhs.size(), rhs.data());
       destroy_items_after(size());
     }
@@ -341,14 +354,14 @@ class RAJAVec
     clear();
     shrink_to_fit();
 
-    m_data = rhs.m_data;
+    m_data      = rhs.m_data;
     m_allocator = std::move(rhs.m_allocator);
-    m_capacity = rhs.m_capacity;
-    m_size = rhs.m_size;
+    m_capacity  = rhs.m_capacity;
+    m_size      = rhs.m_size;
 
-    rhs.m_data = nullptr;
+    rhs.m_data     = nullptr;
     rhs.m_capacity = 0;
-    rhs.m_size = 0;
+    rhs.m_size     = 0;
   }
 
   ///
@@ -357,23 +370,29 @@ class RAJAVec
   ///
   void move_assign_private(RAJAVec&& rhs, std::false_type)
   {
-    if (m_allocator == rhs.m_allocator) {
+    if (m_allocator == rhs.m_allocator)
+    {
       clear();
       shrink_to_fit();
 
-      m_data = rhs.m_data;
+      m_data     = rhs.m_data;
       m_capacity = rhs.m_capacity;
-      m_size = rhs.m_size;
+      m_size     = rhs.m_size;
 
-      rhs.m_data = nullptr;
+      rhs.m_data     = nullptr;
       rhs.m_capacity = 0;
-      rhs.m_size = 0;
-    } else {
+      rhs.m_size     = 0;
+    }
+    else
+    {
       reserve(rhs.size());
-      if (size() < rhs.size()) {
+      if (size() < rhs.size())
+      {
         move_assign_items(0, size(), rhs.data());
         move_construct_items_back(rhs.size(), rhs.data());
-      } else {
+      }
+      else
+      {
         move_assign_items(0, rhs.size(), rhs.data());
         destroy_items_after(size());
       }
@@ -386,10 +405,10 @@ class RAJAVec
   void swap_private(RAJAVec& other, std::true_type)
   {
     using std::swap;
-    swap(m_data,      other.m_data);
+    swap(m_data, other.m_data);
     swap(m_allocator, other.m_allocator);
-    swap(m_capacity,  other.m_capacity);
-    swap(m_size,      other.m_size);
+    swap(m_capacity, other.m_capacity);
+    swap(m_size, other.m_size);
   }
 
   ///
@@ -398,9 +417,9 @@ class RAJAVec
   void swap_private(RAJAVec& other, std::false_type)
   {
     using std::swap;
-    swap(m_data,      other.m_data);
-    swap(m_capacity,  other.m_capacity);
-    swap(m_size,      other.m_size);
+    swap(m_data, other.m_data);
+    swap(m_capacity, other.m_capacity);
+    swap(m_size, other.m_size);
   }
 
   //
@@ -408,7 +427,8 @@ class RAJAVec
   //
   void copy_assign_items(size_type first, size_type last, const_pointer o_data)
   {
-    for (size_type i = first; i < last; ++i) {
+    for (size_type i = first; i < last; ++i)
+    {
       m_data[i] = o_data[i];
     }
   }
@@ -418,7 +438,8 @@ class RAJAVec
   //
   void move_assign_items(size_type first, size_type last, pointer o_data)
   {
-    for (size_type i = first; i < last; ++i) {
+    for (size_type i = first; i < last; ++i)
+    {
       m_data[i] = std::move(o_data[i]);
     }
   }
@@ -426,11 +447,13 @@ class RAJAVec
   //
   // Construct items [m_size, new_size) from args.
   //
-  template < typename ... Os >
+  template <typename... Os>
   void construct_items_back(size_type new_size, Os&&... os)
   {
-    for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward<Os>(os)...);
+    for (; m_size < new_size; ++m_size)
+    {
+      allocator_traits_type::construct(m_allocator, m_data + m_size,
+                                       std::forward<Os>(os)...);
     }
   }
 
@@ -439,8 +462,10 @@ class RAJAVec
   //
   void copy_construct_items_back(size_type new_size, const_pointer o_data)
   {
-    for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, o_data[m_size]);
+    for (; m_size < new_size; ++m_size)
+    {
+      allocator_traits_type::construct(m_allocator, m_data + m_size,
+                                       o_data[m_size]);
     }
   }
 
@@ -449,8 +474,10 @@ class RAJAVec
   //
   void move_construct_items_back(size_type new_size, pointer o_data)
   {
-    for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, std::move(o_data[m_size]));
+    for (; m_size < new_size; ++m_size)
+    {
+      allocator_traits_type::construct(m_allocator, m_data + m_size,
+                                       std::move(o_data[m_size]));
     }
   }
 
@@ -459,39 +486,45 @@ class RAJAVec
   //
   void destroy_items_after(size_type new_end)
   {
-    for (; m_size > new_end; --m_size) {
-      allocator_traits_type::destroy(m_allocator, m_data+m_size-1);
+    for (; m_size > new_end; --m_size)
+    {
+      allocator_traits_type::destroy(m_allocator, m_data + m_size - 1);
     }
   }
 
   //
   // Add an item to the front, shifting all existing items back one.
   //
-  template < typename ... Os >
+  template <typename... Os>
   void emplace_front_private(Os&&... os)
   {
     reserve(m_size + 1);
 
-    if (m_size > 0) {
+    if (m_size > 0)
+    {
       size_type i = m_size;
-      allocator_traits_type::construct(m_allocator, m_data+i, std::move(m_data[i - 1]));
-      for (--i; i > 0; --i) {
+      allocator_traits_type::construct(m_allocator, m_data + i,
+                                       std::move(m_data[i - 1]));
+      for (--i; i > 0; --i)
+      {
         m_data[i] = std::move(m_data[i - 1]);
       }
       allocator_traits_type::destroy(m_allocator, m_data);
     }
-    allocator_traits_type::construct(m_allocator, m_data, std::forward<Os>(os)...);
+    allocator_traits_type::construct(m_allocator, m_data,
+                                     std::forward<Os>(os)...);
     m_size++;
   }
 
   //
   // Add an item to the back.
   //
-  template < typename ... Os >
+  template <typename... Os>
   void emplace_back_private(Os&&... os)
   {
     reserve(m_size + 1);
-    allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward<Os>(os)...);
+    allocator_traits_type::construct(m_allocator, m_data + m_size,
+                                     std::forward<Os>(os)...);
     m_size++;
   }
 
@@ -501,7 +534,7 @@ class RAJAVec
   // relying on STL directly.
   //
   static constexpr const size_type s_init_cap = 8;
-  static constexpr const double s_grow_fac = 1.5;
+  static constexpr const double s_grow_fac    = 1.5;
 
   //
   // Get the next value for capacity given a target and minimum.
@@ -509,7 +542,8 @@ class RAJAVec
   size_type get_next_cap(size_type target_size)
   {
     size_type next_cap = s_init_cap;
-    if (m_capacity != 0) {
+    if (m_capacity != 0)
+    {
       next_cap = static_cast<size_type>(m_capacity * s_grow_fac);
     }
     return std::max(target_size, next_cap);
@@ -520,7 +554,8 @@ class RAJAVec
   //
   void grow_cap(size_type target_size)
   {
-    if (m_capacity < target_size) {
+    if (m_capacity < target_size)
+    {
       change_cap(get_next_cap(target_size));
     }
   }
@@ -530,7 +565,8 @@ class RAJAVec
   //
   void shrink_cap(size_type target_size)
   {
-    if (m_capacity > target_size) {
+    if (m_capacity > target_size)
+    {
       change_cap(std::max(m_size, target_size));
     }
   }
@@ -542,19 +578,23 @@ class RAJAVec
   void change_cap(size_type next_cap)
   {
     pointer tdata = nullptr;
-    if (next_cap != 0) {
+    if (next_cap != 0)
+    {
       tdata = allocator_traits_type::allocate(m_allocator, next_cap);
     }
 
-    if (m_data) {
-      for (size_type i = 0; i < m_size; ++i) {
-        allocator_traits_type::construct(m_allocator, tdata+i, std::move(m_data[i]));
-        allocator_traits_type::destroy(m_allocator, m_data+i);
+    if (m_data)
+    {
+      for (size_type i = 0; i < m_size; ++i)
+      {
+        allocator_traits_type::construct(m_allocator, tdata + i,
+                                         std::move(m_data[i]));
+        allocator_traits_type::destroy(m_allocator, m_data + i);
       }
       allocator_traits_type::deallocate(m_allocator, m_data, m_capacity);
     }
 
-    m_data = tdata;
+    m_data     = tdata;
     m_capacity = next_cap;
   }
 };
diff --git a/include/RAJA/internal/fault_tolerance.hpp b/include/RAJA/internal/fault_tolerance.hpp
index cf3a86cede..66d03ca6cd 100644
--- a/include/RAJA/internal/fault_tolerance.hpp
+++ b/include/RAJA/internal/fault_tolerance.hpp
@@ -37,60 +37,72 @@
 #include <stdio.h>
 #include "cycle.h"
 
-#define RAJA_FT_BEGIN                          \
-  extern volatile int fault_type;              \
-  bool repeat;                                 \
-  bool do_time = false;                        \
-  ticks start = 0, stop = 0;                   \
-  if (fault_type != 0) {                       \
-    printf("Uncaught fault %d\n", fault_type); \
-    fault_type = 0;                            \
-  }                                            \
-  do {                                         \
-    repeat = false;                            \
-    if (do_time) {                             \
-      start = getticks();                      \
+#define RAJA_FT_BEGIN                                                          \
+  extern volatile int fault_type;                                              \
+  bool repeat;                                                                 \
+  bool do_time = false;                                                        \
+  ticks start = 0, stop = 0;                                                   \
+  if (fault_type != 0)                                                         \
+  {                                                                            \
+    printf("Uncaught fault %d\n", fault_type);                                 \
+    fault_type = 0;                                                            \
+  }                                                                            \
+  do                                                                           \
+  {                                                                            \
+    repeat = false;                                                            \
+    if (do_time)                                                               \
+    {                                                                          \
+      start = getticks();                                                      \
     }
 
-#define RAJA_FT_END                                                          \
-  if (do_time) {                                                             \
-    stop = getticks();                                                       \
-    printf("recoverable fault clock cycles = %16f\n", elapsed(stop, start)); \
-    do_time = false;                                                         \
-    fault_type = 0;                                                          \
-  }                                                                          \
-  if (fault_type < 0) {                                                      \
-    printf("Unrecoverable fault (restart penalty)\n");                       \
-    fault_type = 0;                                                          \
-  }                                                                          \
-  if (fault_type > 0) {                                                      \
-    /* invalidate cache */                                                   \
-    repeat = true;                                                           \
-    do_time = true;                                                          \
-  }                                                                          \
-  }                                                                          \
-  while (repeat == true)                                                     \
+#define RAJA_FT_END                                                            \
+  if (do_time)                                                                 \
+  {                                                                            \
+    stop = getticks();                                                         \
+    printf("recoverable fault clock cycles = %16f\n", elapsed(stop, start));   \
+    do_time    = false;                                                        \
+    fault_type = 0;                                                            \
+  }                                                                            \
+  if (fault_type < 0)                                                          \
+  {                                                                            \
+    printf("Unrecoverable fault (restart penalty)\n");                         \
+    fault_type = 0;                                                            \
+  }                                                                            \
+  if (fault_type > 0)                                                          \
+  {                                                                            \
+    /* invalidate cache */                                                     \
+    repeat  = true;                                                            \
+    do_time = true;                                                            \
+  }                                                                            \
+  }                                                                            \
+  while (repeat == true)                                                       \
     ;
 
 #else
-#define RAJA_FT_BEGIN             \
-  extern volatile int fault_type; \
-  bool repeat;                    \
-  if (fault_type == 0) {          \
-    do {                          \
+#define RAJA_FT_BEGIN                                                          \
+  extern volatile int fault_type;                                              \
+  bool repeat;                                                                 \
+  if (fault_type == 0)                                                         \
+  {                                                                            \
+    do                                                                         \
+    {                                                                          \
       repeat = false;
 
-#define RAJA_FT_END        \
-  if (fault_type > 0) {    \
-    /* invalidate cache */ \
-    repeat = true;         \
-    fault_type = 0;        \
-  }                        \
-  }                        \
-  while (repeat == true)   \
-    ;                      \
-  }                        \
-  else { fault_type = 0; /* ignore for the simulation */ }
+#define RAJA_FT_END                                                            \
+  if (fault_type > 0)                                                          \
+  {                                                                            \
+    /* invalidate cache */                                                     \
+    repeat     = true;                                                         \
+    fault_type = 0;                                                            \
+  }                                                                            \
+  }                                                                            \
+  while (repeat == true)                                                       \
+    ;                                                                          \
+  }                                                                            \
+  else                                                                         \
+  {                                                                            \
+    fault_type = 0; /* ignore for the simulation */                            \
+  }
 
 #endif  // RAJA_REPORT_FT
 
diff --git a/include/RAJA/internal/foldl.hpp b/include/RAJA/internal/foldl.hpp
index af65c05392..f16bd9bee4 100644
--- a/include/RAJA/internal/foldl.hpp
+++ b/include/RAJA/internal/foldl.hpp
@@ -44,14 +44,16 @@ template <typename Op, typename... Rest>
 struct foldl_impl;
 
 template <typename Op, typename Arg1>
-struct foldl_impl<Op, Arg1> {
+struct foldl_impl<Op, Arg1>
+{
   using Ret = Arg1;
 };
 
 #if RAJA_HAS_CXX17_IS_INVOCABLE
 
 template <typename Op, typename Arg1, typename Arg2>
-struct foldl_impl<Op, Arg1, Arg2> {
+struct foldl_impl<Op, Arg1, Arg2>
+{
   using Ret = typename std::invoke_result<Op, Arg1, Arg2>::type;
 };
 
@@ -60,18 +62,22 @@ template <typename Op,
           typename Arg2,
           typename Arg3,
           typename... Rest>
-struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
-  using Ret = typename foldl_impl<
-      Op,
-      typename std::invoke_result<Op, typename std::invoke_result<Op, Arg1, Arg2>::type,
-                                      Arg3>::type,
-      Rest...>::Ret;
+struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...>
+{
+  using Ret =
+      typename foldl_impl<Op,
+                          typename std::invoke_result<
+                              Op,
+                              typename std::invoke_result<Op, Arg1, Arg2>::type,
+                              Arg3>::type,
+                          Rest...>::Ret;
 };
 
 #else
 
 template <typename Op, typename Arg1, typename Arg2>
-struct foldl_impl<Op, Arg1, Arg2> {
+struct foldl_impl<Op, Arg1, Arg2>
+{
   using Ret = typename std::result_of<Op(Arg1, Arg2)>::type;
 };
 
@@ -80,7 +86,8 @@ template <typename Op,
           typename Arg2,
           typename Arg3,
           typename... Rest>
-struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
+struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...>
+{
   using Ret = typename foldl_impl<
       Op,
       typename std::result_of<Op(typename std::result_of<Op(Arg1, Arg2)>::type,
@@ -90,20 +97,19 @@ struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
 
 #endif
 
-} // namespace detail
+}  // namespace detail
 
 template <typename Op, typename Arg1>
-RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(
-    Op&& RAJA_UNUSED_ARG(operation),
-    Arg1&& arg) -> typename detail::foldl_impl<Op, Arg1>::Ret
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto
+foldl(Op&& RAJA_UNUSED_ARG(operation), Arg1&& arg) ->
+    typename detail::foldl_impl<Op, Arg1>::Ret
 {
   return camp::forward<Arg1>(arg);
 }
 
 template <typename Op, typename Arg1, typename Arg2>
-RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation,
-                                                  Arg1&& arg1,
-                                                  Arg2&& arg2) ->
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto
+foldl(Op&& operation, Arg1&& arg1, Arg2&& arg2) ->
     typename detail::foldl_impl<Op, Arg1, Arg2>::Ret
 {
   return camp::forward<Op>(operation)(camp::forward<Arg1>(arg1),
@@ -115,11 +121,8 @@ template <typename Op,
           typename Arg2,
           typename Arg3,
           typename... Rest>
-RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation,
-                                                  Arg1&& arg1,
-                                                  Arg2&& arg2,
-                                                  Arg3&& arg3,
-                                                  Rest&&... rest) ->
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto
+foldl(Op&& operation, Arg1&& arg1, Arg2&& arg2, Arg3&& arg3, Rest&&... rest) ->
     typename detail::foldl_impl<Op, Arg1, Arg2, Arg3, Rest...>::Ret
 {
   return foldl(camp::forward<Op>(operation),
diff --git a/include/RAJA/internal/get_platform.hpp b/include/RAJA/internal/get_platform.hpp
index 0354d04bfd..313ef66934 100644
--- a/include/RAJA/internal/get_platform.hpp
+++ b/include/RAJA/internal/get_platform.hpp
@@ -8,18 +8,21 @@
 namespace RAJA
 {
 
-namespace policy {
-namespace multi {
+namespace policy
+{
+namespace multi
+{
 template <typename Selector, typename... Policies>
 class MultiPolicy;
 
 }
-}
+}  // namespace policy
 
-namespace detail 
+namespace detail
 {
 
-struct max_platform {
+struct max_platform
+{
   RAJA_HOST_DEVICE
   RAJA_INLINE
   constexpr RAJA::Platform operator()(const RAJA::Platform& l,
@@ -34,7 +37,8 @@ struct max_platform {
  * This is a catch-all, so anything undefined gets Platform::undefined
  */
 template <typename T, typename = void>
-struct get_platform {
+struct get_platform
+{
   // catch-all: undefined platform
   static constexpr Platform value = Platform::undefined;
 };
@@ -45,7 +49,8 @@ struct get_platform {
  * reduction of them all.
  */
 template <typename... Policies>
-struct get_platform_from_list {
+struct get_platform_from_list
+{
   static constexpr Platform value =
       foldl(max_platform(), get_platform<Policies>::value...);
 };
@@ -54,7 +59,8 @@ struct get_platform_from_list {
  * Define an empty list as Platform::undefined;
  */
 template <>
-struct get_platform_from_list<> {
+struct get_platform_from_list<>
+{
   static constexpr Platform value = Platform::undefined;
 };
 
@@ -67,10 +73,10 @@ struct get_platform_from_list<> {
  */
 template <typename T>
 struct get_platform<T,
-                    typename std::
-                        enable_if<std::is_base_of<RAJA::PolicyBase, T>::value
-                                  && !RAJA::type_traits::is_indexset_policy<T>::
-                                         value>::type> {
+                    typename std::enable_if<
+                        std::is_base_of<RAJA::PolicyBase, T>::value &&
+                        !RAJA::type_traits::is_indexset_policy<T>::value>::type>
+{
 
   static constexpr Platform value = T::platform;
 };
@@ -83,12 +89,13 @@ struct get_platform<T,
  */
 template <typename SEG, typename EXEC>
 struct get_platform<RAJA::ExecPolicy<SEG, EXEC>>
-    : public get_platform_from_list<SEG, EXEC> {
-};
+    : public get_platform_from_list<SEG, EXEC>
+{};
 
 
 template <typename T>
-struct get_statement_platform {
+struct get_statement_platform
+{
   static constexpr Platform value =
       get_platform_from_list<typename T::execution_policy_t,
                              typename T::enclosed_statements_t>::value;
@@ -102,7 +109,8 @@ struct get_statement_platform {
  * each of them.
  */
 template <typename... Stmts>
-struct get_platform<RAJA::internal::StatementList<Stmts...>> {
+struct get_platform<RAJA::internal::StatementList<Stmts...>>
+{
   static constexpr Platform value =
       foldl(max_platform(), get_statement_platform<Stmts>::value...);
 };
@@ -111,7 +119,8 @@ struct get_platform<RAJA::internal::StatementList<Stmts...>> {
  * Specialize for an empty statement list to be undefined
  */
 template <>
-struct get_platform<RAJA::internal::StatementList<>> {
+struct get_platform<RAJA::internal::StatementList<>>
+{
   static constexpr Platform value = Platform::undefined;
 };
 
@@ -120,11 +129,12 @@ struct get_platform<RAJA::internal::StatementList<>> {
 // Once a specific policy is selected, that policy will select the correct
 // platform... see policy_invoker in MultiPolicy.hpp
 template <typename SELECTOR, typename... POLICIES>
-struct get_platform<RAJA::policy::multi::MultiPolicy<SELECTOR, POLICIES...>> {
+struct get_platform<RAJA::policy::multi::MultiPolicy<SELECTOR, POLICIES...>>
+{
   static constexpr Platform value = Platform::undefined;
 };
 
-} // closing brace for detail namespace
-} // closing brace for RAJA namespace
+}  // namespace detail
+}  // namespace RAJA
 
-#endif // RAJA_get_platform_HPP
+#endif  // RAJA_get_platform_HPP
diff --git a/include/RAJA/pattern/WorkGroup.hpp b/include/RAJA/pattern/WorkGroup.hpp
index 767821b8d8..be5abb6848 100644
--- a/include/RAJA/pattern/WorkGroup.hpp
+++ b/include/RAJA/pattern/WorkGroup.hpp
@@ -38,38 +38,44 @@ namespace RAJA
  *
  * \verbatim
 
-   WorkPool<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> pool(allocator);
+   WorkPool<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator>
+       pool(allocator);
 
    pool.enqueue(..., [=] (Index_type i, int* xarg0, int xarg1) {
       xarg0[i] = xarg1;
    });
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> group =
+       pool.instantiate();
 
    int* xarg0 = ...;
    int xarg1 = ...;
-   WorkSite<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> site = group.run(xarg0, xarg1);
+   WorkSite<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> site =
+       group.run(xarg0, xarg1);
 
  * \endverbatim
  *
  ******************************************************************************
  */
-template < typename ... Args >
+template <typename... Args>
 using xargs = camp::list<Args...>;
 
-namespace detail {
+namespace detail
+{
 
-template < typename T >
-struct is_xargs {
+template <typename T>
+struct is_xargs
+{
   static constexpr bool value = false;
 };
 
-template < typename ... Args >
-struct is_xargs<xargs<Args...>> {
+template <typename... Args>
+struct is_xargs<xargs<Args...>>
+{
   static constexpr bool value = true;
 };
 
-}
+}  // namespace detail
 
 
 //
@@ -102,7 +108,8 @@ struct is_xargs<xargs<Args...>> {
       data[i] = 1;
    });
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group =
+       pool.instantiate();
 
  * \endverbatim
  *
@@ -112,11 +119,13 @@ template <typename WORKGROUP_POLICY_T,
           typename INDEX_T,
           typename EXTRA_ARGS_T,
           typename ALLOCATOR_T>
-struct WorkPool {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+struct WorkPool
+{
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkPool: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkPool: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkPool: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
 /*!
@@ -135,9 +144,11 @@ struct WorkPool {
  *
  * \verbatim
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group =
+       pool.instantiate();
 
-   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site = group.run();
+   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site =
+       group.run();
 
  * \endverbatim
  *
@@ -147,11 +158,13 @@ template <typename WORKGROUP_POLICY_T,
           typename INDEX_T,
           typename EXTRA_ARGS_T,
           typename ALLOCATOR_T>
-struct WorkGroup {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+struct WorkGroup
+{
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkGroup: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkGroup: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkGroup: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
 /*!
@@ -170,7 +183,8 @@ struct WorkGroup {
  *
  * \verbatim
 
-   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site = group.run();
+   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site =
+       group.run();
 
    site.synchronize();
 
@@ -182,11 +196,13 @@ template <typename WORKGROUP_POLICY_T,
           typename INDEX_T,
           typename EXTRA_ARGS_T,
           typename ALLOCATOR_T>
-struct WorkSite {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+struct WorkSite
+{
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkSite: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkSite: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkSite: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
 
@@ -195,7 +211,7 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
 struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
                                 ORDER_POLICY_T,
@@ -205,23 +221,32 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
                 xargs<Args...>,
                 ALLOCATOR_T>
 {
-  using exec_policy = EXEC_POLICY_T;
-  using order_policy = ORDER_POLICY_T;
-  using storage_policy = STORAGE_POLICY_T;
+  using exec_policy     = EXEC_POLICY_T;
+  using order_policy    = ORDER_POLICY_T;
+  using storage_policy  = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
-  using index_type = INDEX_T;
-  using xarg_type = xargs<Args...>;
-  using Allocator = ALLOCATOR_T;
+  using policy          = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
+  using index_type      = INDEX_T;
+  using xarg_type       = xargs<Args...>;
+  using Allocator       = ALLOCATOR_T;
 
   using workgroup_type = WorkGroup<policy, index_type, xarg_type, Allocator>;
-  using worksite_type = WorkSite<policy, index_type, xarg_type, Allocator>;
+  using worksite_type  = WorkSite<policy, index_type, xarg_type, Allocator>;
 
 private:
-  using workrunner_type = detail::WorkRunner<
-      exec_policy, order_policy, dispatch_policy, Allocator, index_type, Args...>;
-  using storage_type = detail::WorkStorage<
-      storage_policy, Allocator, typename workrunner_type::dispatcher_type>;
+  using workrunner_type = detail::WorkRunner<exec_policy,
+                                             order_policy,
+                                             dispatch_policy,
+                                             Allocator,
+                                             index_type,
+                                             Args...>;
+  using storage_type =
+      detail::WorkStorage<storage_policy,
+                          Allocator,
+                          typename workrunner_type::dispatcher_type>;
 
   friend workgroup_type;
   friend worksite_type;
@@ -229,52 +254,45 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
 public:
   using resource_type = typename workrunner_type::resource_type;
 
-  explicit WorkPool(Allocator const& aloc)
-    : m_storage(aloc)
-  { }
+  explicit WorkPool(Allocator const& aloc) : m_storage(aloc) {}
 
-  WorkPool(WorkPool const&) = delete;
+  WorkPool(WorkPool const&)            = delete;
   WorkPool& operator=(WorkPool const&) = delete;
 
-  WorkPool(WorkPool&&) = default;
+  WorkPool(WorkPool&&)            = default;
   WorkPool& operator=(WorkPool&&) = default;
 
-  size_t num_loops() const
-  {
-    return m_storage.size();
-  }
+  size_t num_loops() const { return m_storage.size(); }
 
-  size_t storage_bytes() const
-  {
-    return m_storage.storage_size();
-  }
+  size_t storage_bytes() const { return m_storage.storage_size(); }
 
   void reserve(size_t num_loops, size_t storage_bytes)
   {
     m_storage.reserve(num_loops, storage_bytes);
   }
 
-  template < typename segment_T, typename loop_T >
+  template <typename segment_T, typename loop_T>
   inline void enqueue(segment_T&& seg, loop_T&& loop_body)
   {
     {
       // ignore zero length loops
-      using std::begin; using std::end;
+      using std::begin;
+      using std::end;
       if (begin(seg) == end(seg)) return;
     }
-    if (m_storage.begin() == m_storage.end()) {
+    if (m_storage.begin() == m_storage.end())
+    {
       // perform auto-reserve on reuse
       reserve(m_max_num_loops, m_max_storage_bytes);
     }
 
-    util::PluginContext context{util::make_context<exec_policy>()};
+    util::PluginContext context {util::make_context<exec_policy>()};
     util::callPreCapturePlugins(context);
 
     using RAJA::util::trigger_updates_before;
     auto body = trigger_updates_before(loop_body);
 
-    m_runner.enqueue(
-        m_storage, std::forward<segment_T>(seg), std::move(body));
+    m_runner.enqueue(m_storage, std::forward<segment_T>(seg), std::move(body));
 
     util::callPostCapturePlugins(context);
   }
@@ -289,14 +307,11 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
     m_runner.clear();
   }
 
-  ~WorkPool()
-  {
-    clear();
-  }
+  ~WorkPool() { clear(); }
 
 private:
   storage_type m_storage;
-  size_t m_max_num_loops = 0;
+  size_t m_max_num_loops     = 0;
   size_t m_max_storage_bytes = 0;
 
   workrunner_type m_runner;
@@ -307,7 +322,7 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
 struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
                                  ORDER_POLICY_T,
@@ -317,20 +332,23 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
                  xargs<Args...>,
                  ALLOCATOR_T>
 {
-  using exec_policy = EXEC_POLICY_T;
-  using order_policy = ORDER_POLICY_T;
-  using storage_policy = STORAGE_POLICY_T;
+  using exec_policy     = EXEC_POLICY_T;
+  using order_policy    = ORDER_POLICY_T;
+  using storage_policy  = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
-  using index_type = INDEX_T;
-  using xarg_type = xargs<Args...>;
-  using Allocator = ALLOCATOR_T;
+  using policy          = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
+  using index_type      = INDEX_T;
+  using xarg_type       = xargs<Args...>;
+  using Allocator       = ALLOCATOR_T;
 
   using workpool_type = WorkPool<policy, index_type, xarg_type, Allocator>;
   using worksite_type = WorkSite<policy, index_type, xarg_type, Allocator>;
 
 private:
-  using storage_type = typename workpool_type::storage_type;
+  using storage_type    = typename workpool_type::storage_type;
   using workrunner_type = typename workpool_type::workrunner_type;
 
   friend workpool_type;
@@ -339,15 +357,16 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
 public:
   using resource_type = typename workpool_type::resource_type;
 
-  WorkGroup(WorkGroup const&) = delete;
+  WorkGroup(WorkGroup const&)            = delete;
   WorkGroup& operator=(WorkGroup const&) = delete;
 
-  WorkGroup(WorkGroup&&) = default;
+  WorkGroup(WorkGroup&&)            = default;
   WorkGroup& operator=(WorkGroup&&) = default;
 
   inline worksite_type run(resource_type r, Args...);
 
-  worksite_type run(Args... args) {
+  worksite_type run(Args... args)
+  {
     auto r = resource_type::get_default();
     return run(r, std::move(args)...);
   }
@@ -360,19 +379,15 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
     m_runner.clear();
   }
 
-  ~WorkGroup()
-  {
-    clear();
-  }
+  ~WorkGroup() { clear(); }
 
 private:
   storage_type m_storage;
   workrunner_type m_runner;
 
   WorkGroup(storage_type&& storage, workrunner_type&& runner)
-    : m_storage(std::move(storage))
-    , m_runner(std::move(runner))
-  { }
+      : m_storage(std::move(storage)), m_runner(std::move(runner))
+  {}
 };
 
 template <typename EXEC_POLICY_T,
@@ -380,7 +395,7 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
 struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
                                 ORDER_POLICY_T,
@@ -390,16 +405,19 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
                 xargs<Args...>,
                 ALLOCATOR_T>
 {
-  using exec_policy = EXEC_POLICY_T;
-  using order_policy = ORDER_POLICY_T;
-  using storage_policy = STORAGE_POLICY_T;
+  using exec_policy     = EXEC_POLICY_T;
+  using order_policy    = ORDER_POLICY_T;
+  using storage_policy  = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
-  using index_type = INDEX_T;
-  using xarg_type = xargs<Args...>;
-  using Allocator = ALLOCATOR_T;
-
-  using workpool_type = WorkPool<policy, index_type, xarg_type, Allocator>;
+  using policy          = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
+  using index_type      = INDEX_T;
+  using xarg_type       = xargs<Args...>;
+  using Allocator       = ALLOCATOR_T;
+
+  using workpool_type  = WorkPool<policy, index_type, xarg_type, Allocator>;
   using workgroup_type = WorkGroup<policy, index_type, xarg_type, Allocator>;
 
 private:
@@ -412,16 +430,13 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
 public:
   using resource_type = typename workpool_type::resource_type;
 
-  WorkSite(WorkSite const&) = delete;
+  WorkSite(WorkSite const&)            = delete;
   WorkSite& operator=(WorkSite const&) = delete;
 
-  WorkSite(WorkSite&&) = default;
+  WorkSite(WorkSite&&)            = default;
   WorkSite& operator=(WorkSite&&) = default;
 
-  resource_type get_resource() const
-  {
-    return m_resource;
-  }
+  resource_type get_resource() const { return m_resource; }
 
   void clear()
   {
@@ -429,19 +444,15 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
     // TODO: synchronize
   }
 
-  ~WorkSite()
-  {
-    clear();
-  }
+  ~WorkSite() { clear(); }
 
 private:
   per_run_storage m_run_storage;
   resource_type m_resource;
 
   explicit WorkSite(resource_type r, per_run_storage&& run_storage)
-    : m_run_storage(std::move(run_storage))
-    , m_resource(r)
-  { }
+      : m_run_storage(std::move(run_storage)), m_resource(r)
+  {}
 };
 
 
@@ -450,26 +461,29 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
-inline
-typename WorkPool<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::workgroup_type
-WorkPool<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::instantiate()
+inline typename WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
+                                         ORDER_POLICY_T,
+                                         STORAGE_POLICY_T,
+                                         DISPATCH_POLICY_T>,
+                         INDEX_T,
+                         xargs<Args...>,
+                         ALLOCATOR_T>::workgroup_type
+WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
+                         ORDER_POLICY_T,
+                         STORAGE_POLICY_T,
+                         DISPATCH_POLICY_T>,
+         INDEX_T,
+         xargs<Args...>,
+         ALLOCATOR_T>::instantiate()
 {
   // update max sizes to auto-reserve on reuse
-  m_max_num_loops = std::max(m_storage.size(), m_max_num_loops);
+  m_max_num_loops     = std::max(m_storage.size(), m_max_num_loops);
   m_max_storage_bytes = std::max(m_storage.storage_size(), m_max_storage_bytes);
 
   // move storage into workgroup
-  return workgroup_type{std::move(m_storage), std::move(m_runner)};
+  return workgroup_type {std::move(m_storage), std::move(m_runner)};
 }
 
 template <typename EXEC_POLICY_T,
@@ -477,30 +491,37 @@ template <typename EXEC_POLICY_T,
           typename STORAGE_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename INDEX_T,
-          typename ... Args,
+          typename... Args,
           typename ALLOCATOR_T>
-inline
-typename WorkGroup<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::worksite_type
+inline typename WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
+                                          ORDER_POLICY_T,
+                                          STORAGE_POLICY_T,
+                                          DISPATCH_POLICY_T>,
+                          INDEX_T,
+                          xargs<Args...>,
+                          ALLOCATOR_T>::worksite_type
 WorkGroup<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
+    WorkGroupPolicy<EXEC_POLICY_T,
+                    ORDER_POLICY_T,
+                    STORAGE_POLICY_T,
+                    DISPATCH_POLICY_T>,
     INDEX_T,
     xargs<Args...>,
-    ALLOCATOR_T>::run(typename WorkGroup<
-                          WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-                          INDEX_T,
-                          xargs<Args...>,
-                          ALLOCATOR_T>::resource_type r,
+    ALLOCATOR_T>::run(typename WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
+                                                         ORDER_POLICY_T,
+                                                         STORAGE_POLICY_T,
+                                                         DISPATCH_POLICY_T>,
+                                         INDEX_T,
+                                         xargs<Args...>,
+                                         ALLOCATOR_T>::resource_type r,
                       Args... args)
 {
-  util::PluginContext context{util::make_context<EXEC_POLICY_T>()};
+  util::PluginContext context {util::make_context<EXEC_POLICY_T>()};
   util::callPreLaunchPlugins(context);
 
   // move any per run storage into worksite
-  worksite_type site(r, m_runner.run(m_storage, r, std::forward<Args>(args)...));
+  worksite_type site(r,
+                     m_runner.run(m_storage, r, std::forward<Args>(args)...));
 
   util::callPostLaunchPlugins(context);
 
diff --git a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
index 1eac283f4b..d7c35feb3d 100644
--- a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
@@ -36,35 +36,36 @@ namespace RAJA
 namespace detail
 {
 
-template < typename >
+template <typename>
 struct DispatcherVoidPtrWrapper
 {
   void* ptr;
   DispatcherVoidPtrWrapper() = default;
   // implicit constructor from void*
-  RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) { }
+  RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) {}
 };
 
-template < typename >
+template <typename>
 struct DispatcherVoidConstPtrWrapper
 {
   const void* ptr;
   DispatcherVoidConstPtrWrapper() = default;
   // implicit constructor from const void*
-  RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) { }
+  RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) {}
 };
 
 
-constexpr bool dispatcher_use_host_invoke(Platform platform) {
+constexpr bool dispatcher_use_host_invoke(Platform platform)
+{
   return !(platform == Platform::cuda || platform == Platform::hip);
 }
 
 // Transforms one dispatch policy into another by creating a dispatch policy
 // of holder_type objects. See usage in WorkRunner for more explanation.
-template < typename dispatch_policy, typename holder_type >
+template <typename dispatch_policy, typename holder_type>
 struct dispatcher_transform_types;
 ///
-template < typename dispatch_policy, typename holder_type >
+template <typename dispatch_policy, typename holder_type>
 using dispatcher_transform_types_t =
     typename dispatcher_transform_types<dispatch_policy, holder_type>::type;
 
@@ -75,12 +76,17 @@ using dispatcher_transform_types_t =
  * DispatcherID is used to differentiate function pointers based on their
  * function signature.
  */
-template < Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs >
+template <Platform platform,
+          typename dispatch_policy,
+          typename DispatcherID,
+          typename... CallArgs>
 struct Dispatcher;
 
 
-template < typename holder_type >
-struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holder_type> {
+template <typename holder_type>
+struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch,
+                                  holder_type>
+{
   using type = ::RAJA::indirect_function_call_dispatch;
 };
 
@@ -93,38 +99,44 @@ struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holde
  * during device linking when functions with high register counts may cause
  * device linking to fail.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherID, CallArgs...> {
+template <Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::indirect_function_call_dispatch,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::indirect_function_call_dispatch;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy   = ::RAJA::indirect_function_call_dispatch;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
   ///
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  template < typename T >
-  static void s_move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src)
+  template <typename T>
+  static void s_move_construct_destroy(void_ptr_wrapper dest,
+                                       void_ptr_wrapper src)
   {
     T* dest_as_T = static_cast<T*>(dest.ptr);
-    T* src_as_T = static_cast<T*>(src.ptr);
-    new(dest_as_T) T(std::move(*src_as_T));
+    T* src_as_T  = static_cast<T*>(src.ptr);
+    new (dest_as_T) T(std::move(*src_as_T));
     (*src_as_T).~T();
   }
 
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  template < typename T >
+  template <typename T>
   static void s_host_invoke(void_cptr_wrapper obj, CallArgs... args)
   {
     const T* obj_as_T = static_cast<const T*>(obj.ptr);
     (*obj_as_T)(std::forward<CallArgs>(args)...);
   }
   ///
-  template < typename T >
-  static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, CallArgs... args)
+  template <typename T>
+  static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj,
+                                          CallArgs... args)
   {
     const T* obj_as_T = static_cast<const T*>(obj.ptr);
     (*obj_as_T)(std::forward<CallArgs>(args)...);
@@ -133,22 +145,26 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   ///
   /// destroy the object of type T in obj
   ///
-  template < typename T >
+  template <typename T>
   static void s_destroy(void_ptr_wrapper obj)
   {
     T* obj_as_T = static_cast<T*>(obj.ptr);
     (*obj_as_T).~T();
   }
 
-  using mover_type = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/);
-  using invoker_type = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/);
-  using destroyer_type = void(*)(void_ptr_wrapper /*obj*/);
+  using mover_type     = void (*)(void_ptr_wrapper /*dest*/,
+                              void_ptr_wrapper /*src*/);
+  using invoker_type   = void (*)(void_cptr_wrapper /*obj*/,
+                                CallArgs... /*args*/);
+  using destroyer_type = void (*)(void_ptr_wrapper /*obj*/);
 
   // This can't be a cuda device lambda due to compiler limitations
-  template < typename T >
-  struct DeviceInvokerFactory {
+  template <typename T>
+  struct DeviceInvokerFactory
+  {
     using value_type = invoker_type;
-    RAJA_DEVICE value_type operator()() {
+    RAJA_DEVICE value_type operator()()
+    {
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
       return nullptr;
 #else
@@ -160,14 +176,14 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    return { mover_type{&s_move_construct_destroy<T>},
-             invoker_type{&s_host_invoke<T>},
-             destroyer_type{&s_destroy<T>},
-             sizeof(T)
-           };
+  template <typename T,
+            bool uhi               = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    return {mover_type {&s_move_construct_destroy<T>},
+            invoker_type {&s_host_invoke<T>}, destroyer_type {&s_destroy<T>},
+            sizeof(T)};
   }
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
@@ -179,14 +195,16 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   /// to create the invoker object. This allows for a separation between
   /// object creation and the device context (cuda, hip, etc) and copying.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) {
-    return { mover_type{&s_move_construct_destroy<T>},
-             invoker_type{std::forward<CreateOnDevice>(createOnDevice)(DeviceInvokerFactory<T>{})},
-             destroyer_type{&s_destroy<T>},
-             sizeof(T)
-           };
+  template <typename T,
+            typename CreateOnDevice,
+            bool uhi                = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice)
+  {
+    return {mover_type {&s_move_construct_destroy<T>},
+            invoker_type {std::forward<CreateOnDevice>(createOnDevice)(
+                DeviceInvokerFactory<T> {})},
+            destroyer_type {&s_destroy<T>}, sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -196,8 +214,10 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
 };
 
 
-template < typename holder_type >
-struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, holder_type> {
+template <typename holder_type>
+struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch,
+                                  holder_type>
+{
   using type = ::RAJA::indirect_virtual_function_dispatch;
 };
 
@@ -210,38 +230,48 @@ struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, ho
  * during device linking when functions with high register counts may cause
  * device linking to fail.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, DispatcherID, CallArgs...> {
+template <Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::indirect_virtual_function_dispatch,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::indirect_virtual_function_dispatch;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy   = ::RAJA::indirect_virtual_function_dispatch;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
-  struct impl_base {
-    virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const = 0;
-    virtual void destroy(void_ptr_wrapper obj) const = 0;
+  struct impl_base
+  {
+    virtual void move_destroy(void_ptr_wrapper dest,
+                              void_ptr_wrapper src) const = 0;
+    virtual void destroy(void_ptr_wrapper obj) const      = 0;
   };
 
-  struct host_impl_base {
+  struct host_impl_base
+  {
     virtual void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0;
   };
 
-  struct device_impl_base {
-    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0;
+  struct device_impl_base
+  {
+    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj,
+                                    CallArgs... args) const = 0;
   };
 
-  template < typename T >
+  template <typename T>
   struct base_impl_type : impl_base
   {
     ///
     /// move construct an object of type T in dest as a copy of a T from src and
     /// destroy the T obj in src
     ///
-    virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const override
+    virtual void move_destroy(void_ptr_wrapper dest,
+                              void_ptr_wrapper src) const override
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
-      T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      T* src_as_T  = static_cast<T*>(src.ptr);
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
 
@@ -255,7 +285,7 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
     }
   };
 
-  template < typename T >
+  template <typename T>
   struct host_impl_type : host_impl_base
   {
     ///
@@ -268,20 +298,22 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
     }
   };
 
-  template < typename T >
+  template <typename T>
   struct device_impl_type : device_impl_base
   {
     ///
     /// invoke the call operator of the object of type T in obj with args
     ///
-    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const override
+    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj,
+                                    CallArgs... args) const override
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
 
-  struct mover_type {
+  struct mover_type
+  {
     impl_base* m_impl;
     void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
@@ -289,7 +321,8 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
     }
   };
 
-  struct host_invoker_type {
+  struct host_invoker_type
+  {
     host_impl_base* m_impl;
     void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
@@ -297,30 +330,30 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
     }
   };
   ///
-  struct device_invoker_type {
+  struct device_invoker_type
+  {
     device_impl_base* m_impl;
     RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
       m_impl->invoke(obj, std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
-  struct destroyer_type {
+  struct destroyer_type
+  {
     impl_base* m_impl;
-    void operator()(void_ptr_wrapper obj) const
-    {
-      m_impl->destroy(obj);
-    }
+    void operator()(void_ptr_wrapper obj) const { m_impl->destroy(obj); }
   };
 
   // This can't be a cuda device lambda due to compiler limitations
-  template < typename T >
-  struct DeviceImplTypeFactory {
+  template <typename T>
+  struct DeviceImplTypeFactory
+  {
     using value_type = device_impl_type<T>*;
-    RAJA_DEVICE value_type operator()() {
+    RAJA_DEVICE value_type operator()()
+    {
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
       return nullptr;
 #else
@@ -333,16 +366,15 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
+  template <typename T,
+            bool uhi               = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
     static base_impl_type<T> s_base_impl;
     static host_impl_type<T> s_host_impl;
-    return { mover_type{&s_base_impl},
-             host_invoker_type{&s_host_impl},
-             destroyer_type{&s_base_impl},
-             sizeof(T)
-           };
+    return {mover_type {&s_base_impl}, host_invoker_type {&s_host_impl},
+            destroyer_type {&s_base_impl}, sizeof(T)};
   }
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
@@ -354,17 +386,17 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
   /// to create the invoker object. This allows for a separation between
   /// object creation and the device context (cuda, hip, etc) and copying.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr>
-  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) {
+  template <typename T,
+            typename CreateOnDevice,
+            bool uhi                = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice)
+  {
     static base_impl_type<T> s_base_impl;
-    static device_impl_type<T>* s_device_impl_ptr{
-        std::forward<CreateOnDevice>(createOnDevice)(DeviceImplTypeFactory<T>{}) };
-    return { mover_type{&s_base_impl},
-             device_invoker_type{s_device_impl_ptr},
-             destroyer_type{&s_base_impl},
-             sizeof(T)
-           };
+    static device_impl_type<T>* s_device_impl_ptr {std::forward<CreateOnDevice>(
+        createOnDevice)(DeviceImplTypeFactory<T> {})};
+    return {mover_type {&s_base_impl}, device_invoker_type {s_device_impl_ptr},
+            destroyer_type {&s_base_impl}, sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -375,61 +407,68 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
 
 
 // direct_dispatch expects a list of types
-template < typename ... Ts, typename holder_type >
-struct dispatcher_transform_types<::RAJA::direct_dispatch<Ts...>, holder_type> {
-  using type = ::RAJA::direct_dispatch<typename holder_type::template type<Ts>...>;
+template <typename... Ts, typename holder_type>
+struct dispatcher_transform_types<::RAJA::direct_dispatch<Ts...>, holder_type>
+{
+  using type =
+      ::RAJA::direct_dispatch<typename holder_type::template type<Ts>...>;
 };
 
 /*!
  * Version of Dispatcher that does direct dispatch to zero callable types.
  * It implements the interface with callable objects.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...> {
+template <Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<>,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::direct_dispatch<>;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy                 = ::RAJA::direct_dispatch<>;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
   ///
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  struct mover_type {
-    void operator()(void_ptr_wrapper, void_ptr_wrapper) const
-    { }
+  struct mover_type
+  {
+    void operator()(void_ptr_wrapper, void_ptr_wrapper) const {}
   };
 
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  struct host_invoker_type {
-    void operator()(void_cptr_wrapper, CallArgs...) const
-    { }
+  struct host_invoker_type
+  {
+    void operator()(void_cptr_wrapper, CallArgs...) const {}
   };
-  struct device_invoker_type {
-    RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const
-    { }
+  struct device_invoker_type
+  {
+    RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const {}
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
   ///
-  struct destroyer_type {
-    void operator()(void_ptr_wrapper) const
-    { }
+  struct destroyer_type
+  {
+    void operator()(void_ptr_wrapper) const {}
   };
 
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)};
+  template <typename T,
+            bool uhi               = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    return {mover_type {}, host_invoker_type {}, destroyer_type {}, sizeof(T)};
   }
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
@@ -437,10 +476,14 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
-    return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)};
+  template <typename T,
+            typename CreateOnDevice,
+            bool uhi                = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
+    return {mover_type {}, device_invoker_type {}, destroyer_type {},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -453,23 +496,31 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...
  * Version of Dispatcher that does direct dispatch to a single callable type.
  * It implements the interface with callable objects.
  */
-template < Platform platform, typename T, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs...> {
+template <Platform platform,
+          typename T,
+          typename DispatcherID,
+          typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<T>,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::direct_dispatch<T>;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy                 = ::RAJA::direct_dispatch<T>;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
   ///
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  struct mover_type {
+  struct mover_type
+  {
     void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
-      T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      T* src_as_T  = static_cast<T*>(src.ptr);
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
   };
@@ -477,28 +528,30 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  struct host_invoker_type {
+  struct host_invoker_type
+  {
     void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  struct device_invoker_type {
+  struct device_invoker_type
+  {
     RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
   ///
-  struct destroyer_type {
+  struct destroyer_type
+  {
     void operator()(void_ptr_wrapper obj) const
     {
       T* obj_as_T = static_cast<T*>(obj.ptr);
@@ -509,11 +562,14 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename U,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    static_assert(std::is_same<T, U>::value, "U must be in direct_dispatch types");
-    return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)};
+  template <typename U,
+            bool uhi               = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    static_assert(std::is_same<T, U>::value,
+                  "U must be in direct_dispatch types");
+    return {mover_type {}, host_invoker_type {}, destroyer_type {}, sizeof(T)};
   }
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
@@ -521,11 +577,16 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename U, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
-    static_assert(std::is_same<T, U>::value, "U must be in direct_dispatch types");
-    return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)};
+  template <typename U,
+            typename CreateOnDevice,
+            bool uhi                = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
+    static_assert(std::is_same<T, U>::value,
+                  "U must be in direct_dispatch types");
+    return {mover_type {}, device_invoker_type {}, destroyer_type {},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -538,46 +599,55 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
  * Version of Dispatcher that does direct dispatch to multiple callable types.
  * It implements the interface with callable objects.
  */
-template < typename T0, typename T1, typename ... TNs,
-           Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
-                  DispatcherID, CallArgs...> {
+template <typename T0,
+          typename T1,
+          typename... TNs,
+          Platform platform,
+          typename DispatcherID,
+          typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<T0, T1, TNs...>,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::direct_dispatch<T0, T1, TNs...>;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy   = ::RAJA::direct_dispatch<T0, T1, TNs...>;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
-  using id_type = int;
-  using callable_indices = camp::make_int_seq_t<id_type, 2+sizeof...(TNs)>;
-  using callable_types = camp::list<T0, T1, TNs...>;
+  using id_type          = int;
+  using callable_indices = camp::make_int_seq_t<id_type, 2 + sizeof...(TNs)>;
+  using callable_types   = camp::list<T0, T1, TNs...>;
 
   ///
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  struct mover_type {
+  struct mover_type
+  {
     id_type id;
 
     void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  dest, src);
+      impl_helper(callable_indices {}, callable_types {}, dest, src);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_ptr_wrapper dest, void_ptr_wrapper src) const
+    template <int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_ptr_wrapper dest,
+                     void_ptr_wrapper src) const
     {
       camp::sink(((id_types == id) ? (impl<Ts>(dest, src), 0) : 0)...);
     }
 
-    template < typename T >
+    template <typename T>
     void impl(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
-      T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      T* src_as_T  = static_cast<T*>(src.ptr);
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
   };
@@ -585,79 +655,89 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  struct host_invoker_type {
+  struct host_invoker_type
+  {
     id_type id;
 
     void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj, std::forward<CallArgs>(args)...);
+      impl_helper(callable_indices {}, callable_types {}, obj,
+                  std::forward<CallArgs>(args)...);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_cptr_wrapper obj, CallArgs... args) const
+    template <int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_cptr_wrapper obj,
+                     CallArgs... args) const
     {
-      camp::sink(((id_types == id) ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0) : 0)...);
+      camp::sink(((id_types == id)
+                      ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0)
+                      : 0)...);
     }
 
-    template < typename T >
+    template <typename T>
     void impl(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  struct device_invoker_type {
+  struct device_invoker_type
+  {
     id_type id;
 
     RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj, std::forward<CallArgs>(args)...);
+      impl_helper(callable_indices {}, callable_types {}, obj,
+                  std::forward<CallArgs>(args)...);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    RAJA_DEVICE void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_cptr_wrapper obj, CallArgs... args) const
+    template <int... id_types, typename... Ts>
+    RAJA_DEVICE void impl_helper(camp::int_seq<int, id_types...>,
+                                 camp::list<Ts...>,
+                                 void_cptr_wrapper obj,
+                                 CallArgs... args) const
     {
-      camp::sink(((id_types == id) ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0) : 0)...);
+      camp::sink(((id_types == id)
+                      ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0)
+                      : 0)...);
     }
 
-    template < typename T >
+    template <typename T>
     RAJA_DEVICE void impl(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
   ///
-  struct destroyer_type {
+  struct destroyer_type
+  {
     id_type id;
 
     void operator()(void_ptr_wrapper obj) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj);
+      impl_helper(callable_indices {}, callable_types {}, obj);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_ptr_wrapper obj) const
+    template <int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_ptr_wrapper obj) const
     {
       camp::sink(((id_types == id) ? (impl<Ts>(obj), 0) : 0)...);
     }
 
-    template < typename T >
+    template <typename T>
     void impl(void_ptr_wrapper obj) const
     {
       T* obj_as_T = static_cast<T*>(obj.ptr);
@@ -671,25 +751,31 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
   /// The id is just the index of T in the list of callable_types.
   /// If T is not in Ts return -1.
   ///
-  template < typename T, int ... id_types, typename ... Ts >
-  static constexpr id_type get_id(camp::int_seq<int, id_types...>, camp::list<Ts...>)
+  template <typename T, int... id_types, typename... Ts>
+  static constexpr id_type get_id(camp::int_seq<int, id_types...>,
+                                  camp::list<Ts...>)
   {
-    id_type id{-1};
+    id_type id {-1};
     // quiet UB warning by sequencing assignment to id with list initialization
-    int unused[] {0, (std::is_same<T, Ts>::value ? ((id = id_types), 0) : 0)...};
-    camp::sink(unused); // quiet unused var warning
+    int unused[] {0,
+                  (std::is_same<T, Ts>::value ? ((id = id_types), 0) : 0)...};
+    camp::sink(unused);  // quiet unused var warning
     return id;
   }
 
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    static constexpr id_type id = get_id<T>(callable_indices{}, callable_types{});
+  template <typename T,
+            bool uhi               = use_host_invoke,
+            std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    static constexpr id_type id =
+        get_id<T>(callable_indices {}, callable_types {});
     static_assert(id != id_type(-1), "T must be in direct_dispatch types");
-    return {mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)};
+    return {mover_type {id}, host_invoker_type {id}, destroyer_type {id},
+            sizeof(T)};
   }
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
@@ -697,12 +783,17 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
-    static constexpr id_type id = get_id<T>(callable_indices{}, callable_types{});
+  template <typename T,
+            typename CreateOnDevice,
+            bool uhi                = use_host_invoke,
+            std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
+    static constexpr id_type id =
+        get_id<T>(callable_indices {}, callable_types {});
     static_assert(id != id_type(-1), "T must be in direct_dispatch types");
-    return {mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)};
+    return {mover_type {id}, device_invoker_type {id}, destroyer_type {id},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
diff --git a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
index 9645f73050..5a666d1c73 100644
--- a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
@@ -40,18 +40,18 @@ namespace detail
 /*!
  * A body and args holder for storing loops that are being executed in foralls
  */
-template <typename LoopBody, typename ... Args>
+template <typename LoopBody, typename... Args>
 struct HoldBodyArgs_base
 {
   // NOTE: This constructor is disabled when body_in is not LoopBody
   // to avoid it conflicting with the copy and move constructors
-  template < typename body_in,
-      typename = typename std::enable_if<
-        std::is_same<LoopBody, camp::decay<body_in>>::value>::type >
+  template <typename body_in,
+            typename = typename std::enable_if<
+                std::is_same<LoopBody, camp::decay<body_in>>::value>::type>
   HoldBodyArgs_base(body_in&& body, Args... args)
-    : m_body(std::forward<body_in>(body))
-    , m_arg_tuple(std::forward<Args>(args)...)
-  { }
+      : m_body(std::forward<body_in>(body)),
+        m_arg_tuple(std::forward<Args>(args)...)
+  {}
 
 protected:
   LoopBody m_body;
@@ -62,7 +62,7 @@ struct HoldBodyArgs_base
  * A body and args holder for storing loops that are being executed in foralls
  * that run on the host
  */
-template <typename LoopBody, typename index_type, typename ... Args>
+template <typename LoopBody, typename index_type, typename... Args>
 struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
 {
   using base = HoldBodyArgs_base<LoopBody, Args...>;
@@ -70,10 +70,10 @@ struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
 
   RAJA_INLINE void operator()(index_type i) const
   {
-    invoke(i, camp::make_idx_seq_t<sizeof...(Args)>{});
+    invoke(i, camp::make_idx_seq_t<sizeof...(Args)> {});
   }
 
-  template < camp::idx_t ... Is >
+  template <camp::idx_t... Is>
   RAJA_INLINE void invoke(index_type i, camp::idx_seq<Is...>) const
   {
     this->m_body(i, get<Is>(this->m_arg_tuple)...);
@@ -84,7 +84,7 @@ struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
  * A body and args holder for storing loops that are being executed in foralls
  * that run on the device
  */
-template <typename LoopBody, typename index_type, typename ... Args>
+template <typename LoopBody, typename index_type, typename... Args>
 struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
 {
   using base = HoldBodyArgs_base<LoopBody, Args...>;
@@ -92,10 +92,10 @@ struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
 
   RAJA_DEVICE RAJA_INLINE void operator()(index_type i) const
   {
-    invoke(i, camp::make_idx_seq_t<sizeof...(Args)>{});
+    invoke(i, camp::make_idx_seq_t<sizeof...(Args)> {});
   }
 
-  template < camp::idx_t ... Is >
+  template <camp::idx_t... Is>
   RAJA_DEVICE RAJA_INLINE void invoke(index_type i, camp::idx_seq<Is...>) const
   {
     this->m_body(i, get<Is>(this->m_arg_tuple)...);
@@ -105,28 +105,29 @@ struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
 /*!
  * A body and segment holder for storing loops that will be executed as foralls
  */
-template <typename ExecutionPolicy, typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
+template <typename ExecutionPolicy,
+          typename Segment_type,
+          typename LoopBody,
+          typename index_type,
+          typename... Args>
 struct HoldForall
 {
   using resource_type = typename resources::get_resource<ExecutionPolicy>::type;
-  using HoldBodyArgs = typename std::conditional<
+  using HoldBodyArgs  = typename std::conditional<
       !type_traits::is_device_exec_policy<ExecutionPolicy>::value,
       HoldBodyArgs_host<LoopBody, index_type, Args...>,
-      HoldBodyArgs_device<LoopBody, index_type, Args...> >::type;
+      HoldBodyArgs_device<LoopBody, index_type, Args...>>::type;
 
-  template < typename segment_in, typename body_in >
+  template <typename segment_in, typename body_in>
   HoldForall(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {}
 
   RAJA_INLINE void operator()(resource_type r, Args... args) const
   {
-    wrap::forall(r,
-                 ExecutionPolicy(),
-                 m_segment,
-                 HoldBodyArgs{m_body, std::forward<Args>(args)...});
+    wrap::forall(r, ExecutionPolicy(), m_segment,
+                 HoldBodyArgs {m_body, std::forward<Args>(args)...});
   }
 
 private:
@@ -143,7 +144,7 @@ template <typename EXEC_POLICY_T,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner;
 
 
@@ -156,28 +157,32 @@ template <typename FORALL_EXEC_POLICY,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunnerForallOrdered_base
 {
-  using exec_policy = EXEC_POLICY_T;
-  using order_policy = ORDER_POLICY_T;
+  using exec_policy     = EXEC_POLICY_T;
+  using order_policy    = ORDER_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using Allocator = ALLOCATOR_T;
-  using index_type = INDEX_T;
-  using resource_type = typename resources::get_resource<FORALL_EXEC_POLICY>::type;
+  using Allocator       = ALLOCATOR_T;
+  using index_type      = INDEX_T;
+  using resource_type =
+      typename resources::get_resource<FORALL_EXEC_POLICY>::type;
 
   using forall_exec_policy = FORALL_EXEC_POLICY;
 
   // The type that will hold the segment and loop body in work storage
-  struct holder_type {
-    template < typename T >
-    using type = HoldForall<forall_exec_policy,
-                            typename camp::at<T, camp::num<0>>::type, // segment_type
-                            typename camp::at<T, camp::num<1>>::type, // loop_type
-                            index_type, Args...>;
+  struct holder_type
+  {
+    template <typename T>
+    using type =
+        HoldForall<forall_exec_policy,
+                   typename camp::at<T, camp::num<0>>::type,  // segment_type
+                   typename camp::at<T, camp::num<1>>::type,  // loop_type
+                   index_type,
+                   Args...>;
   };
   ///
-  template < typename T >
+  template <typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -186,33 +191,40 @@ struct WorkRunnerForallOrdered_base
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::host, dispatcher_holder_policy, void, resource_type, Args...>;
+  using dispatcher_type = Dispatcher<Platform::host,
+                                     dispatcher_holder_policy,
+                                     void,
+                                     resource_type,
+                                     Args...>;
 
   WorkRunnerForallOrdered_base() = default;
 
   WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base const&) = delete;
-  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base const&) = delete;
+  WorkRunnerForallOrdered_base&
+  operator=(WorkRunnerForallOrdered_base const&) = delete;
 
-  WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base &&) = default;
-  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base &&) = default;
+  WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base&&) = default;
+  WorkRunnerForallOrdered_base&
+  operator=(WorkRunnerForallOrdered_base&&) = default;
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename segment_T, typename loop_T >
+  template <typename WorkContainer, typename segment_T, typename loop_T>
   inline void enqueue(WorkContainer& storage, segment_T&& seg, loop_T&& loop)
   {
-    using holder = holder_type_t<camp::list<camp::decay<segment_T>, camp::decay<loop_T>>>;
+    using holder =
+        holder_type_t<camp::list<camp::decay<segment_T>, camp::decay<loop_T>>>;
 
     storage.template emplace<holder>(
-        get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
+        get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy {}),
         std::forward<segment_T>(seg), std::forward<loop_T>(loop));
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  { }
+  void clear() {}
 
   // no extra storage required here
   using per_run_storage = int;
@@ -227,39 +239,38 @@ template <typename FORALL_EXEC_POLICY,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunnerForallOrdered
-    : WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>
+    : WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                   EXEC_POLICY_T,
+                                   ORDER_POLICY_T,
+                                   DISPATCH_POLICY_T,
+                                   ALLOCATOR_T,
+                                   INDEX_T,
+                                   Args...>
 {
-  using base = WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>;
+  using base = WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                            EXEC_POLICY_T,
+                                            ORDER_POLICY_T,
+                                            DISPATCH_POLICY_T,
+                                            ALLOCATOR_T,
+                                            INDEX_T,
+                                            Args...>;
   using base::base;
 
   // run the loops using forall in the order that they were enqueued
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   typename base::per_run_storage run(WorkContainer const& storage,
                                      typename base::resource_type r,
                                      Args... args) const
   {
     using value_type = typename WorkContainer::value_type;
 
-    typename base::per_run_storage run_storage{};
+    typename base::per_run_storage run_storage {};
 
     auto end = storage.end();
-    for (auto iter = storage.begin(); iter != end; ++iter) {
+    for (auto iter = storage.begin(); iter != end; ++iter)
+    {
       value_type::host_call(&*iter, r, args...);
     }
 
@@ -276,40 +287,40 @@ template <typename FORALL_EXEC_POLICY,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunnerForallReverse
-    : WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>
+    : WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                   EXEC_POLICY_T,
+                                   ORDER_POLICY_T,
+                                   DISPATCH_POLICY_T,
+                                   ALLOCATOR_T,
+                                   INDEX_T,
+                                   Args...>
 {
-  using base = WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>;
+  using base = WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                            EXEC_POLICY_T,
+                                            ORDER_POLICY_T,
+                                            DISPATCH_POLICY_T,
+                                            ALLOCATOR_T,
+                                            INDEX_T,
+                                            Args...>;
   using base::base;
 
-  // run the loops using forall in the reverse order to the order they were enqueued
-  template < typename WorkContainer >
+  // run the loops using forall in the reverse of the order in which they
+  // were enqueued
+  template <typename WorkContainer>
   typename base::per_run_storage run(WorkContainer const& storage,
                                      typename base::resource_type r,
                                      Args... args) const
   {
     using value_type = typename WorkContainer::value_type;
 
-    typename base::per_run_storage run_storage{};
+    typename base::per_run_storage run_storage {};
 
     auto begin = storage.begin();
-    for (auto iter = storage.end(); iter != begin; --iter) {
-      value_type::host_call(&*(iter-1), r, args...);
+    for (auto iter = storage.end(); iter != begin; --iter)
+    {
+      value_type::host_call(&*(iter - 1), r, args...);
     }
 
     return run_storage;
diff --git a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
index 52631d108f..d7eceaef7f 100644
--- a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
@@ -46,23 +46,23 @@ namespace detail
 //   operator -  ( iterator_base const& )
 //   operator == ( iterator_base const& )
 //   operator <  ( iterator_base const& )
-template < typename iterator_base >
+template <typename iterator_base>
 struct random_access_iterator : iterator_base
 {
-  using base = iterator_base;
-  using value_type = const typename base::value_type;
-  using pointer = typename base::pointer;
-  using reference = typename base::reference;
-  using difference_type = typename base::difference_type;
+  using base              = iterator_base;
+  using value_type        = const typename base::value_type;
+  using pointer           = typename base::pointer;
+  using reference         = typename base::reference;
+  using difference_type   = typename base::difference_type;
   using iterator_category = std::random_access_iterator_tag;
 
   using base::base;
 
   random_access_iterator(random_access_iterator const&) = default;
-  random_access_iterator(random_access_iterator &&) = default;
+  random_access_iterator(random_access_iterator&&)      = default;
 
   random_access_iterator& operator=(random_access_iterator const&) = default;
-  random_access_iterator& operator=(random_access_iterator &&) = default;
+  random_access_iterator& operator=(random_access_iterator&&)      = default;
 
 
   RAJA_HOST_DEVICE reference operator*() const
@@ -70,10 +70,7 @@ struct random_access_iterator : iterator_base
     return *static_cast<base const&>(*this);
   }
 
-  RAJA_HOST_DEVICE pointer operator->() const
-  {
-    return &(*(*this));
-  }
+  RAJA_HOST_DEVICE pointer operator->() const { return &(*(*this)); }
 
   RAJA_HOST_DEVICE reference operator[](difference_type i) const
   {
@@ -120,68 +117,75 @@ struct random_access_iterator : iterator_base
     return *this;
   }
 
-  RAJA_HOST_DEVICE friend inline random_access_iterator operator+(
-      random_access_iterator const& lhs, difference_type rhs)
+  RAJA_HOST_DEVICE friend inline random_access_iterator
+  operator+(random_access_iterator const& lhs, difference_type rhs)
   {
     random_access_iterator copy = lhs;
     copy += rhs;
     return copy;
   }
 
-  RAJA_HOST_DEVICE friend inline random_access_iterator operator+(
-      difference_type lhs, random_access_iterator const& rhs)
+  RAJA_HOST_DEVICE friend inline random_access_iterator
+  operator+(difference_type lhs, random_access_iterator const& rhs)
   {
     random_access_iterator copy = rhs;
     copy += lhs;
     return copy;
   }
 
-  RAJA_HOST_DEVICE friend inline random_access_iterator operator-(
-      random_access_iterator const& lhs, difference_type rhs)
+  RAJA_HOST_DEVICE friend inline random_access_iterator
+  operator-(random_access_iterator const& lhs, difference_type rhs)
   {
     random_access_iterator copy = lhs;
     copy -= rhs;
     return copy;
   }
 
-  RAJA_HOST_DEVICE friend inline difference_type operator-(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+  RAJA_HOST_DEVICE friend inline difference_type
+  operator-(random_access_iterator const& lhs,
+            random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) - static_cast<base const&>(rhs);
   }
 
-  RAJA_HOST_DEVICE friend inline bool operator==(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+  RAJA_HOST_DEVICE friend inline bool
+  operator==(random_access_iterator const& lhs,
+             random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) == static_cast<base const&>(rhs);
   }
 
-  RAJA_HOST_DEVICE friend inline bool operator!=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+  RAJA_HOST_DEVICE friend inline bool
+  operator!=(random_access_iterator const& lhs,
+             random_access_iterator const& rhs)
   {
     return !(lhs == rhs);
   }
 
-  RAJA_HOST_DEVICE friend inline bool operator<(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+  RAJA_HOST_DEVICE friend inline bool
+  operator<(random_access_iterator const& lhs,
+            random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) < static_cast<base const&>(rhs);
   }
 
-  RAJA_HOST_DEVICE friend inline bool operator<=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+  RAJA_HOST_DEVICE friend inline bool
+  operator<=(random_access_iterator const& lhs,
+             random_access_iterator const& rhs)
   {
     return !(rhs < lhs);
   }
 
-  RAJA_HOST_DEVICE friend inline bool operator>(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+  RAJA_HOST_DEVICE friend inline bool
+  operator>(random_access_iterator const& lhs,
+            random_access_iterator const& rhs)
   {
     return rhs < lhs;
   }
 
-  RAJA_HOST_DEVICE friend inline bool operator>=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+  RAJA_HOST_DEVICE friend inline bool
+  operator>=(random_access_iterator const& lhs,
+             random_access_iterator const& rhs)
   {
     return !(lhs < rhs);
   }
@@ -191,10 +195,12 @@ struct random_access_iterator : iterator_base
 /*!
  * A storage container for work groups
  */
-template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Dispatcher_T >
+template <typename STORAGE_POLICY_T,
+          typename ALLOCATOR_T,
+          typename Dispatcher_T>
 class WorkStorage;
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template <typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 {
   using allocator_traits_type = std::allocator_traits<ALLOCATOR_T>;
@@ -202,25 +208,27 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
-  using storage_policy = RAJA::array_of_pointers;
+  using storage_policy  = RAJA::array_of_pointers;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template <typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
-  using value_type = GenericWorkStruct<dispatcher_type>;
-  using allocator_type = ALLOCATOR_T;
-  using size_type = std::size_t;
+  using value_type      = GenericWorkStruct<dispatcher_type>;
+  using allocator_type  = ALLOCATOR_T;
+  using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
-  using reference = value_type&;
+  using reference       = value_type&;
   using const_reference = const value_type&;
-  using pointer = value_type*;
-  using const_pointer = const value_type*;
+  using pointer         = value_type*;
+  using const_pointer   = const value_type*;
 
 private:
   // struct used in storage vector to retain pointer and allocation size
@@ -231,24 +239,19 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   };
 
 public:
-
-  // iterator base class for accessing stored WorkStructs outside of the container
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
   struct const_iterator_base
   {
-    using value_type = const typename WorkStorage::value_type;
-    using pointer = typename WorkStorage::const_pointer;
-    using reference = typename WorkStorage::const_reference;
-    using difference_type = typename WorkStorage::difference_type;
+    using value_type        = const typename WorkStorage::value_type;
+    using pointer           = typename WorkStorage::const_pointer;
+    using reference         = typename WorkStorage::const_reference;
+    using difference_type   = typename WorkStorage::difference_type;
     using iterator_category = std::random_access_iterator_tag;
 
-    const_iterator_base(const pointer_and_size* ptrptr)
-      : m_ptrptr(ptrptr)
-    { }
+    const_iterator_base(const pointer_and_size* ptrptr) : m_ptrptr(ptrptr) {}
 
-    RAJA_HOST_DEVICE reference operator*() const
-    {
-      return *(m_ptrptr->ptr);
-    }
+    RAJA_HOST_DEVICE reference operator*() const { return *(m_ptrptr->ptr); }
 
     RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n)
     {
@@ -256,20 +259,23 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
       return *this;
     }
 
-    RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline difference_type
+    operator-(const_iterator_base const& lhs_iter,
+              const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr - rhs_iter.m_ptrptr;
     }
 
-    RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline bool
+    operator==(const_iterator_base const& lhs_iter,
+               const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr == rhs_iter.m_ptrptr;
     }
 
-    RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline bool
+    operator<(const_iterator_base const& lhs_iter,
+              const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr < rhs_iter.m_ptrptr;
     }
@@ -282,22 +288,22 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 
 
   explicit WorkStorage(allocator_type const& aloc)
-    : m_vec(0, aloc)
-    , m_aloc(aloc)
-  { }
+      : m_vec(0, aloc), m_aloc(aloc)
+  {}
 
-  WorkStorage(WorkStorage const&) = delete;
+  WorkStorage(WorkStorage const&)            = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_vec(std::move(rhs.m_vec))
-    , m_aloc(std::move(rhs.m_aloc))
-  { }
+      : m_vec(std::move(rhs.m_vec)), m_aloc(std::move(rhs.m_aloc))
+  {}
 
   WorkStorage& operator=(WorkStorage&& rhs)
   {
-    if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+    if (this != &rhs)
+    {
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment {});
     }
     return *this;
   }
@@ -312,33 +318,26 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return m_vec.size();
-  }
+  size_type size() const { return m_vec.size(); }
 
-  const_iterator begin() const
-  {
-    return const_iterator(m_vec.begin());
-  }
+  const_iterator begin() const { return const_iterator(m_vec.begin()); }
 
-  const_iterator end() const
-  {
-    return const_iterator(m_vec.end());
-  }
+  const_iterator end() const { return const_iterator(m_vec.end()); }
 
   // number of bytes used for storage of loops
   size_type storage_size() const
   {
     size_type storage_size_nbytes = 0;
-    for (size_t i = 0; i < m_vec.size(); ++i) {
+    for (size_t i = 0; i < m_vec.size(); ++i)
+    {
       storage_size_nbytes += m_vec[i].size;
     }
     return storage_size_nbytes;
   }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template <typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
     m_vec.emplace_back(create_value<holder>(
         dispatcher, std::forward<holder_ctor_args>(ctor_args)...));
@@ -347,27 +346,28 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   // destroy all stored loops, deallocates all storage
   void clear()
   {
-    while (!m_vec.empty()) {
+    while (!m_vec.empty())
+    {
       destroy_value(m_vec.back());
       m_vec.pop_back();
     }
     m_vec.shrink_to_fit();
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
-  RAJAVec<pointer_and_size, typename allocator_traits_type::template rebind_alloc<pointer_and_size>> m_vec;
+  RAJAVec<
+      pointer_and_size,
+      typename allocator_traits_type::template rebind_alloc<pointer_and_size>>
+      m_vec;
   allocator_type m_aloc;
 
   // move assignment if allocator propagates on move assignment
   void move_assign_private(WorkStorage&& rhs, std::true_type)
   {
     clear();
-    m_vec = std::move(rhs.m_vec);
+    m_vec  = std::move(rhs.m_vec);
     m_aloc = std::move(rhs.m_aloc);
   }
 
@@ -375,12 +375,16 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   void move_assign_private(WorkStorage&& rhs, std::false_type)
   {
     clear();
-    if (m_aloc == rhs.m_aloc) {
+    if (m_aloc == rhs.m_aloc)
+    {
       // take storage if allocators compare equal
       m_vec = std::move(rhs.m_vec);
-    } else {
+    }
+    else
+    {
       // allocate new storage if allocators do not compare equal
-      for (size_type i = 0; i < rhs.m_vec.size(); ++i) {
+      for (size_type i = 0; i < rhs.m_vec.size(); ++i)
+      {
         m_vec.emplace_back(move_destroy_value(std::move(rhs), rhs.m_vec[i]));
       }
       rhs.m_vec.clear();
@@ -389,7 +393,7 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   }
 
   // allocate and construct value in storage
-  template < typename holder, typename ... holder_ctor_args >
+  template <typename holder, typename... holder_ctor_args>
   pointer_and_size create_value(const dispatcher_type* dispatcher,
                                 holder_ctor_args&&... ctor_args)
   {
@@ -401,7 +405,7 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
     value_type::template construct<holder>(
         value_ptr, dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
 
-    return pointer_and_size{value_ptr, value_size};
+    return pointer_and_size {value_ptr, value_size};
   }
 
   // allocate and move construct object as copy of other value and
@@ -414,22 +418,24 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 
     value_type::move_destroy(value_ptr, other_value_and_size.ptr);
 
-    allocator_traits_type::deallocate(rhs.m_aloc,
-        reinterpret_cast<char*>(other_value_and_size.ptr), other_value_and_size.size);
+    allocator_traits_type::deallocate(
+        rhs.m_aloc, reinterpret_cast<char*>(other_value_and_size.ptr),
+        other_value_and_size.size);
 
-    return pointer_and_size{value_ptr, other_value_and_size.size};
+    return pointer_and_size {value_ptr, other_value_and_size.size};
   }
 
   // destroy and deallocate value
   void destroy_value(pointer_and_size value_and_size_ptr)
   {
     value_type::destroy(value_and_size_ptr.ptr);
-    allocator_traits_type::deallocate(m_aloc,
-        reinterpret_cast<char*>(value_and_size_ptr.ptr), value_and_size_ptr.size);
+    allocator_traits_type::deallocate(
+        m_aloc, reinterpret_cast<char*>(value_and_size_ptr.ptr),
+        value_and_size_ptr.size);
   }
 };
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template <typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 {
   using allocator_traits_type = std::allocator_traits<ALLOCATOR_T>;
@@ -437,44 +443,45 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
-  using storage_policy = RAJA::ragged_array_of_objects;
+  using storage_policy  = RAJA::ragged_array_of_objects;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template <typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
-  using value_type = GenericWorkStruct<dispatcher_type>;
-  using allocator_type = ALLOCATOR_T;
-  using size_type = std::size_t;
+  using value_type      = GenericWorkStruct<dispatcher_type>;
+  using allocator_type  = ALLOCATOR_T;
+  using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
-  using reference = value_type&;
+  using reference       = value_type&;
   using const_reference = const value_type&;
-  using pointer = value_type*;
-  using const_pointer = const value_type*;
+  using pointer         = value_type*;
+  using const_pointer   = const value_type*;
 
-  // iterator base class for accessing stored WorkStructs outside of the container
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
   struct const_iterator_base
   {
-    using value_type = const typename WorkStorage::value_type;
-    using pointer = typename WorkStorage::const_pointer;
-    using reference = typename WorkStorage::const_reference;
-    using difference_type = typename WorkStorage::difference_type;
+    using value_type        = const typename WorkStorage::value_type;
+    using pointer           = typename WorkStorage::const_pointer;
+    using reference         = typename WorkStorage::const_reference;
+    using difference_type   = typename WorkStorage::difference_type;
     using iterator_category = std::random_access_iterator_tag;
 
     const_iterator_base(const char* array_begin, const size_type* offset_iter)
-      : m_array_begin(array_begin)
-      , m_offset_iter(offset_iter)
-    { }
+        : m_array_begin(array_begin), m_offset_iter(offset_iter)
+    {}
 
     RAJA_HOST_DEVICE reference operator*() const
     {
-      return *reinterpret_cast<pointer>(
-          m_array_begin + *m_offset_iter);
+      return *reinterpret_cast<pointer>(m_array_begin + *m_offset_iter);
     }
 
     RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n)
@@ -483,20 +490,23 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
       return *this;
     }
 
-    RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline difference_type
+    operator-(const_iterator_base const& lhs_iter,
+              const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter - rhs_iter.m_offset_iter;
     }
 
-    RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline bool
+    operator==(const_iterator_base const& lhs_iter,
+               const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter == rhs_iter.m_offset_iter;
     }
 
-    RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline bool
+    operator<(const_iterator_base const& lhs_iter,
+              const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter < rhs_iter.m_offset_iter;
     }
@@ -510,29 +520,30 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
 
   explicit WorkStorage(allocator_type const& aloc)
-    : m_offsets(0, aloc)
-    , m_aloc(aloc)
-  { }
+      : m_offsets(0, aloc), m_aloc(aloc)
+  {}
 
-  WorkStorage(WorkStorage const&) = delete;
+  WorkStorage(WorkStorage const&)            = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_offsets(std::move(rhs.m_offsets))
-    , m_array_begin(rhs.m_array_begin)
-    , m_array_end(rhs.m_array_end)
-    , m_array_cap(rhs.m_array_cap)
-    , m_aloc(std::move(rhs.m_aloc))
+      : m_offsets(std::move(rhs.m_offsets)),
+        m_array_begin(rhs.m_array_begin),
+        m_array_end(rhs.m_array_end),
+        m_array_cap(rhs.m_array_cap),
+        m_aloc(std::move(rhs.m_aloc))
   {
     rhs.m_array_begin = nullptr;
-    rhs.m_array_end = nullptr;
-    rhs.m_array_cap = nullptr;
+    rhs.m_array_end   = nullptr;
+    rhs.m_array_cap   = nullptr;
   }
 
   WorkStorage& operator=(WorkStorage&& rhs)
   {
-    if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+    if (this != &rhs)
+    {
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment {});
     }
     return *this;
   }
@@ -546,10 +557,7 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return m_offsets.size();
-  }
+  size_type size() const { return m_offsets.size(); }
 
   const_iterator begin() const
   {
@@ -562,17 +570,15 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of bytes used for storage of loops
-  size_type storage_size() const
-  {
-    return m_array_end - m_array_begin;
-  }
+  size_type storage_size() const { return m_array_end - m_array_begin; }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template <typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
     size_type value_offset = storage_size();
-    size_type value_size   = create_value<holder>(value_offset,
-        dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
+    size_type value_size   = create_value<holder>(
+        value_offset, dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
     m_offsets.emplace_back(value_offset);
     m_array_end += value_size;
   }
@@ -581,21 +587,22 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   void clear()
   {
     array_clear();
-    if (m_array_begin != nullptr) {
-      allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+    if (m_array_begin != nullptr)
+    {
+      allocator_traits_type::deallocate(m_aloc, m_array_begin,
+                                        storage_capacity());
       m_array_begin = nullptr;
       m_array_end   = nullptr;
       m_array_cap   = nullptr;
     }
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
-  RAJAVec<size_type, typename allocator_traits_type::template rebind_alloc<size_type>> m_offsets;
+  RAJAVec<size_type,
+          typename allocator_traits_type::template rebind_alloc<size_type>>
+      m_offsets;
   char* m_array_begin = nullptr;
   char* m_array_end   = nullptr;
   char* m_array_cap   = nullptr;
@@ -608,8 +615,8 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
     m_offsets     = std::move(rhs.m_offsets);
     m_array_begin = rhs.m_array_begin;
-    m_array_end   = rhs.m_array_end  ;
-    m_array_cap   = rhs.m_array_cap  ;
+    m_array_end   = rhs.m_array_end;
+    m_array_cap   = rhs.m_array_cap;
     m_aloc        = std::move(rhs.m_aloc);
 
     rhs.m_array_begin = nullptr;
@@ -621,25 +628,29 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   void move_assign_private(WorkStorage&& rhs, std::false_type)
   {
     clear();
-    if (m_aloc == rhs.m_aloc) {
+    if (m_aloc == rhs.m_aloc)
+    {
 
       m_offsets     = std::move(rhs.m_offsets);
       m_array_begin = rhs.m_array_begin;
-      m_array_end   = rhs.m_array_end  ;
-      m_array_cap   = rhs.m_array_cap  ;
+      m_array_end   = rhs.m_array_end;
+      m_array_cap   = rhs.m_array_cap;
 
       rhs.m_array_begin = nullptr;
       rhs.m_array_end   = nullptr;
       rhs.m_array_cap   = nullptr;
-    } else {
+    }
+    else
+    {
       array_reserve(rhs.storage_size());
 
-      for (size_type i = 0; i < rhs.size(); ++i) {
+      for (size_type i = 0; i < rhs.size(); ++i)
+      {
         m_array_end = m_array_begin + rhs.m_offsets[i];
         move_destroy_value(m_array_end, rhs.m_array_begin + rhs.m_offsets[i]);
         m_offsets.emplace_back(rhs.m_offsets[i]);
       }
-      m_array_end = m_array_begin + rhs.storage_size();
+      m_array_end     = m_array_begin + rhs.storage_size();
       rhs.m_array_end = rhs.m_array_begin;
       rhs.m_offsets.clear();
       rhs.clear();
@@ -647,46 +658,45 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // get loop storage capacity, used and unused in bytes
-  size_type storage_capacity() const
-  {
-    return m_array_cap - m_array_begin;
-  }
+  size_type storage_capacity() const { return m_array_cap - m_array_begin; }
 
   // get unused loop storage capacity in bytes
-  size_type storage_unused() const
-  {
-    return m_array_cap - m_array_end;
-  }
+  size_type storage_unused() const { return m_array_cap - m_array_end; }
 
   // reserve space for loop_storage_size bytes of loop storage
   void array_reserve(size_type loop_storage_size)
   {
-    if (loop_storage_size > storage_capacity()) {
+    if (loop_storage_size > storage_capacity())
+    {
 
       char* new_array_begin =
           allocator_traits_type::allocate(m_aloc, loop_storage_size);
-      char* new_array_end   = new_array_begin + storage_size();
-      char* new_array_cap   = new_array_begin + loop_storage_size;
+      char* new_array_end = new_array_begin + storage_size();
+      char* new_array_cap = new_array_begin + loop_storage_size;
 
-      for (size_type i = 0; i < size(); ++i) {
+      for (size_type i = 0; i < size(); ++i)
+      {
         move_destroy_value(new_array_begin + m_offsets[i],
-                             m_array_begin + m_offsets[i]);
+                           m_array_begin + m_offsets[i]);
       }
 
-      if (m_array_begin != nullptr) {
-        allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+      if (m_array_begin != nullptr)
+      {
+        allocator_traits_type::deallocate(m_aloc, m_array_begin,
+                                          storage_capacity());
       }
 
       m_array_begin = new_array_begin;
-      m_array_end   = new_array_end  ;
-      m_array_cap   = new_array_cap  ;
+      m_array_end   = new_array_end;
+      m_array_cap   = new_array_cap;
     }
   }
 
   // destroy loop objects (does not deallocate array storage)
   void array_clear()
   {
-    while (!m_offsets.empty()) {
+    while (!m_offsets.empty())
+    {
       destroy_value(m_offsets.back());
       m_array_end = m_array_begin + m_offsets.back();
       m_offsets.pop_back();
@@ -696,15 +706,17 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
   // ensure there is enough storage to hold the next loop body at value offset
   // and store the loop body
-  template < typename holder, typename ... holder_ctor_args >
+  template <typename holder, typename... holder_ctor_args>
   size_type create_value(size_type value_offset,
                          const dispatcher_type* dispatcher,
                          holder_ctor_args&&... ctor_args)
   {
     const size_type value_size = sizeof(true_value_type<holder>);
 
-    if (value_size > storage_unused()) {
-      array_reserve(std::max(storage_size() + value_size, 2*storage_capacity()));
+    if (value_size > storage_unused())
+    {
+      array_reserve(
+          std::max(storage_size() + value_size, 2 * storage_capacity()));
     }
 
     pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
@@ -726,13 +738,12 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   // destroy the loop body at value offset
   void destroy_value(size_type value_offset)
   {
-    pointer value_ptr =
-        reinterpret_cast<pointer>(m_array_begin + value_offset);
+    pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
     value_type::destroy(value_ptr);
   }
 };
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template <typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::constant_stride_array_of_objects,
                   ALLOCATOR_T,
                   Dispatcher_T>
@@ -742,39 +753,41 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
-  using storage_policy = RAJA::constant_stride_array_of_objects;
+  using storage_policy  = RAJA::constant_stride_array_of_objects;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template <typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
-  using value_type = GenericWorkStruct<dispatcher_type>;
-  using allocator_type = ALLOCATOR_T;
-  using size_type = std::size_t;
+  using value_type      = GenericWorkStruct<dispatcher_type>;
+  using allocator_type  = ALLOCATOR_T;
+  using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
-  using reference = value_type&;
+  using reference       = value_type&;
   using const_reference = const value_type&;
-  using pointer = value_type*;
-  using const_pointer = const value_type*;
+  using pointer         = value_type*;
+  using const_pointer   = const value_type*;
 
-  // iterator base class for accessing stored WorkStructs outside of the container
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
   struct const_iterator_base
   {
-    using value_type = const typename WorkStorage::value_type;
-    using pointer = typename WorkStorage::const_pointer;
-    using reference = typename WorkStorage::const_reference;
-    using difference_type = typename WorkStorage::difference_type;
+    using value_type        = const typename WorkStorage::value_type;
+    using pointer           = typename WorkStorage::const_pointer;
+    using reference         = typename WorkStorage::const_reference;
+    using difference_type   = typename WorkStorage::difference_type;
     using iterator_category = std::random_access_iterator_tag;
 
     const_iterator_base(const char* array_pos, size_type stride)
-      : m_array_pos(array_pos)
-      , m_stride(stride)
-    { }
+        : m_array_pos(array_pos), m_stride(stride)
+    {}
 
     RAJA_HOST_DEVICE reference operator*() const
     {
@@ -787,20 +800,23 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
       return *this;
     }
 
-    RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline difference_type
+    operator-(const_iterator_base const& lhs_iter,
+              const_iterator_base const& rhs_iter)
     {
       return (lhs_iter.m_array_pos - rhs_iter.m_array_pos) / lhs_iter.m_stride;
     }
 
-    RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline bool
+    operator==(const_iterator_base const& lhs_iter,
+               const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_array_pos == rhs_iter.m_array_pos;
     }
 
-    RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+    RAJA_HOST_DEVICE friend inline bool
+    operator<(const_iterator_base const& lhs_iter,
+              const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_array_pos < rhs_iter.m_array_pos;
     }
@@ -813,19 +829,17 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   using const_iterator = random_access_iterator<const_iterator_base>;
 
 
-  explicit WorkStorage(allocator_type const& aloc)
-    : m_aloc(aloc)
-  { }
+  explicit WorkStorage(allocator_type const& aloc) : m_aloc(aloc) {}
 
-  WorkStorage(WorkStorage const&) = delete;
+  WorkStorage(WorkStorage const&)            = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_aloc(std::move(rhs.m_aloc))
-    , m_stride(rhs.m_stride)
-    , m_array_begin(rhs.m_array_begin)
-    , m_array_end(rhs.m_array_end)
-    , m_array_cap(rhs.m_array_cap)
+      : m_aloc(std::move(rhs.m_aloc)),
+        m_stride(rhs.m_stride),
+        m_array_begin(rhs.m_array_begin),
+        m_array_end(rhs.m_array_end),
+        m_array_cap(rhs.m_array_cap)
   {
     // do not reset stride, leave it for reuse
     rhs.m_array_begin = nullptr;
@@ -835,8 +849,10 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   WorkStorage& operator=(WorkStorage&& rhs)
   {
-    if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+    if (this != &rhs)
+    {
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment {});
     }
     return *this;
   }
@@ -847,35 +863,28 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   {
     size_type num_storage_loops =
         std::max(num_loops, (loop_storage_size + m_stride - 1) / m_stride);
-    array_reserve(num_storage_loops*m_stride, m_stride);
+    array_reserve(num_storage_loops * m_stride, m_stride);
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return storage_size() / m_stride;
-  }
+  size_type size() const { return storage_size() / m_stride; }
 
   const_iterator begin() const
   {
     return const_iterator(m_array_begin, m_stride);
   }
 
-  const_iterator end() const
-  {
-    return const_iterator(m_array_end, m_stride);
-  }
+  const_iterator end() const { return const_iterator(m_array_end, m_stride); }
 
   // amount of storage in bytes used to store loops
-  size_type storage_size() const
-  {
-    return m_array_end - m_array_begin;
-  }
+  size_type storage_size() const { return m_array_end - m_array_begin; }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template <typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
-    create_value<holder>(dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
+    create_value<holder>(dispatcher,
+                         std::forward<holder_ctor_args>(ctor_args)...);
     m_array_end += m_stride;
   }
 
@@ -883,22 +892,21 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   void clear()
   {
     array_clear();
-    if (m_array_begin != nullptr) {
-      allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+    if (m_array_begin != nullptr)
+    {
+      allocator_traits_type::deallocate(m_aloc, m_array_begin,
+                                        storage_capacity());
       m_array_begin = nullptr;
       m_array_end   = nullptr;
       m_array_cap   = nullptr;
     }
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
   allocator_type m_aloc;
-  size_type m_stride     = 1; // can't be 0 because size divides stride
+  size_type m_stride  = 1;  // can't be 0 because size divides stride
   char* m_array_begin = nullptr;
   char* m_array_end   = nullptr;
   char* m_array_cap   = nullptr;
@@ -909,10 +917,10 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
     clear();
 
     m_aloc        = std::move(rhs.m_aloc);
-    m_stride      = rhs.m_stride     ;
+    m_stride      = rhs.m_stride;
     m_array_begin = rhs.m_array_begin;
-    m_array_end   = rhs.m_array_end  ;
-    m_array_cap   = rhs.m_array_cap  ;
+    m_array_end   = rhs.m_array_end;
+    m_array_cap   = rhs.m_array_cap;
 
     // do not reset stride, leave it for reuse
     rhs.m_array_begin = nullptr;
@@ -924,23 +932,27 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   void move_assign_private(WorkStorage&& rhs, std::false_type)
   {
     clear();
-    if (m_aloc == rhs.m_aloc) {
+    if (m_aloc == rhs.m_aloc)
+    {
 
-      m_stride      = rhs.m_stride     ;
+      m_stride      = rhs.m_stride;
       m_array_begin = rhs.m_array_begin;
-      m_array_end   = rhs.m_array_end  ;
-      m_array_cap   = rhs.m_array_cap  ;
+      m_array_end   = rhs.m_array_end;
+      m_array_cap   = rhs.m_array_cap;
 
       // do not reset stride, leave it for reuse
       rhs.m_array_begin = nullptr;
       rhs.m_array_end   = nullptr;
       rhs.m_array_cap   = nullptr;
-    } else {
+    }
+    else
+    {
 
       m_stride = rhs.m_stride;
       array_reserve(rhs.storage_size(), rhs.m_stride);
 
-      for (size_type i = 0; i < rhs.size(); ++i) {
+      for (size_type i = 0; i < rhs.size(); ++i)
+      {
         move_destroy_value(m_array_end, rhs.m_array_begin + i * rhs.m_stride);
         m_array_end += m_stride;
       }
@@ -950,16 +962,10 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   }
 
   // storage capacity, used and unused, in bytes
-  size_type storage_capacity() const
-  {
-    return m_array_cap - m_array_begin;
-  }
+  size_type storage_capacity() const { return m_array_cap - m_array_begin; }
 
   // unused storage capacity in bytes
-  size_type storage_unused() const
-  {
-    return m_array_cap - m_array_end;
-  }
+  size_type storage_unused() const { return m_array_cap - m_array_end; }
 
   // allocate enough storage for loop_storage_size bytes with
   // each loop body separated by new_stride bytes
@@ -968,33 +974,39 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   // Note that loop_storage_size must be a multiple of new_stride
   void array_reserve(size_type loop_storage_size, size_type new_stride)
   {
-    if (loop_storage_size > storage_capacity() || new_stride > m_stride) {
+    if (loop_storage_size > storage_capacity() || new_stride > m_stride)
+    {
 
       char* new_array_begin =
           allocator_traits_type::allocate(m_aloc, loop_storage_size);
-      char* new_array_end   = new_array_begin + size() * new_stride;
-      char* new_array_cap   = new_array_begin + loop_storage_size;
+      char* new_array_end = new_array_begin + size() * new_stride;
+      char* new_array_cap = new_array_begin + loop_storage_size;
 
-      for (size_type i = 0; i < size(); ++i) {
+      for (size_type i = 0; i < size(); ++i)
+      {
         move_destroy_value(new_array_begin + i * new_stride,
-                             m_array_begin + i *   m_stride);
+                           m_array_begin + i * m_stride);
       }
 
-      if (m_array_begin != nullptr) {
-        allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+      if (m_array_begin != nullptr)
+      {
+        allocator_traits_type::deallocate(m_aloc, m_array_begin,
+                                          storage_capacity());
       }
 
-      m_stride      = new_stride     ;
+      m_stride      = new_stride;
       m_array_begin = new_array_begin;
-      m_array_end   = new_array_end  ;
-      m_array_cap   = new_array_cap  ;
+      m_array_end   = new_array_end;
+      m_array_cap   = new_array_cap;
     }
   }
 
   // destroy the loops in storage (does not deallocate loop storage)
   void array_clear()
   {
-    for (size_type value_offset = storage_size(); value_offset > 0; value_offset -= m_stride) {
+    for (size_type value_offset = storage_size(); value_offset > 0;
+         value_offset -= m_stride)
+    {
       destroy_value(value_offset - m_stride);
       m_array_end -= m_stride;
     }
@@ -1002,18 +1014,20 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   // ensure there is enough storage to store the loop body
   // and construct the body in storage.
-  template < typename holder, typename ... holder_ctor_args >
+  template <typename holder, typename... holder_ctor_args>
   void create_value(const dispatcher_type* dispatcher,
                     holder_ctor_args&&... ctor_args)
   {
     const size_type value_size = sizeof(true_value_type<holder>);
 
-    if (value_size > storage_unused() && value_size <= m_stride) {
-      array_reserve(std::max(storage_size() + m_stride, 2*storage_capacity()),
+    if (value_size > storage_unused() && value_size <= m_stride)
+    {
+      array_reserve(std::max(storage_size() + m_stride, 2 * storage_capacity()),
                     m_stride);
-    } else if (value_size > m_stride) {
-      array_reserve((size()+1)*value_size,
-                    value_size);
+    }
+    else if (value_size > m_stride)
+    {
+      array_reserve((size() + 1) * value_size, value_size);
     }
 
     size_type value_offset = storage_size();
@@ -1025,8 +1039,7 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   // move construct the loop body in value from other and
   // destroy the loop body in other
-  void move_destroy_value(char* value_ptr,
-                          char* other_value_ptr)
+  void move_destroy_value(char* value_ptr, char* other_value_ptr)
   {
     value_type::move_destroy(reinterpret_cast<pointer>(value_ptr),
                              reinterpret_cast<pointer>(other_value_ptr));
@@ -1035,8 +1048,7 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   // destroy the loop body at value offset
   void destroy_value(size_type value_offset)
   {
-    pointer value_ptr =
-        reinterpret_cast<pointer>(m_array_begin + value_offset);
+    pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
     value_type::destroy(value_ptr);
   }
 };
diff --git a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
index 72e1540c54..90792d4037 100644
--- a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
@@ -35,7 +35,7 @@ namespace detail
 /*!
  * A struct that gives a generic way to layout memory for different loops
  */
-template < size_t size, typename Dispatcher_T >
+template <size_t size, typename Dispatcher_T>
 struct WorkStruct;
 
 /*!
@@ -44,67 +44,75 @@ struct WorkStruct;
  *   offsetof(GenericWorkStruct<>, obj) == offsetof(WorkStruct<size>, obj)
  *   sizeof(GenericWorkStruct) <= sizeof(WorkStruct<size>)
  */
-template < typename Dispatcher_T >
+template <typename Dispatcher_T>
 using GenericWorkStruct = WorkStruct<RAJA_MAX_ALIGN, Dispatcher_T>;
 
-template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs >
-struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>>
+template <size_t size,
+          Platform platform,
+          typename dispatch_policy,
+          typename DispatcherID,
+          typename... CallArgs>
+struct WorkStruct<
+    size,
+    Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>>
 {
-  using dispatcher_type = Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>;
+  using dispatcher_type =
+      Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>;
 
   // construct a WorkStruct with a value of type holder from the args and
   // check a variety of constraints at compile time
-  template < typename holder, typename ... holder_ctor_args >
-  static RAJA_INLINE
-  void construct(void* ptr, const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template <typename holder, typename... holder_ctor_args>
+  static RAJA_INLINE void construct(void* ptr,
+                                    const dispatcher_type* dispatcher,
+                                    holder_ctor_args&&... ctor_args)
   {
     using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
-    using value_type = GenericWorkStruct<dispatcher_type>;
+    using value_type      = GenericWorkStruct<dispatcher_type>;
 
     static_assert(sizeof(holder) <= sizeof(true_value_type::obj),
-        "holder must fit in WorkStruct::obj");
+                  "holder must fit in WorkStruct::obj");
     static_assert(std::is_standard_layout<true_value_type>::value,
-        "WorkStruct must be a standard layout type");
+                  "WorkStruct must be a standard layout type");
     static_assert(std::is_standard_layout<value_type>::value,
-        "GenericWorkStruct must be a standard layout type");
-    static_assert(offsetof(value_type, obj) == offsetof(true_value_type, obj),
+                  "GenericWorkStruct must be a standard layout type");
+    static_assert(
+        offsetof(value_type, obj) == offsetof(true_value_type, obj),
         "WorkStruct and GenericWorkStruct must have obj at the same offset");
     static_assert(sizeof(value_type) <= sizeof(true_value_type),
-        "WorkStruct must not be smaller than GenericWorkStruct");
+                  "WorkStruct must not be smaller than GenericWorkStruct");
     true_value_type* value_ptr = static_cast<true_value_type*>(ptr);
 
     value_ptr->dispatcher = dispatcher;
-    value_ptr->invoke = dispatcher->invoke;
-    new(&value_ptr->obj) holder(std::forward<holder_ctor_args>(ctor_args)...);
+    value_ptr->invoke     = dispatcher->invoke;
+    new (&value_ptr->obj) holder(std::forward<holder_ctor_args>(ctor_args)...);
   }
 
   // move construct in dst from the value in src and destroy the value in src
-  static RAJA_INLINE
-  void move_destroy(WorkStruct* value_dst,
-                    WorkStruct* value_src)
+  static RAJA_INLINE void move_destroy(WorkStruct* value_dst,
+                                       WorkStruct* value_src)
   {
     value_dst->dispatcher = value_src->dispatcher;
-    value_dst->invoke = value_src->invoke;
-    value_dst->dispatcher->move_construct_destroy(&value_dst->obj, &value_src->obj);
+    value_dst->invoke     = value_src->invoke;
+    value_dst->dispatcher->move_construct_destroy(&value_dst->obj,
+                                                  &value_src->obj);
   }
 
   // destroy the value ptr
-  static RAJA_INLINE
-  void destroy(WorkStruct* value_ptr)
+  static RAJA_INLINE void destroy(WorkStruct* value_ptr)
   {
     value_ptr->dispatcher->destroy(&value_ptr->obj);
   }
 
   // invoke the call operator of the value ptr with args
-  static RAJA_INLINE
-  void host_call(const WorkStruct* value_ptr, CallArgs... args)
+  static RAJA_INLINE void host_call(const WorkStruct* value_ptr,
+                                    CallArgs... args)
   {
     value_ptr->invoke(&value_ptr->obj, std::forward<CallArgs>(args)...);
   }
   ///
   // invoke the call operator of the value ptr with args
-  static RAJA_DEVICE RAJA_INLINE
-  void device_call(const WorkStruct* value_ptr, CallArgs... args)
+  static RAJA_DEVICE RAJA_INLINE void device_call(const WorkStruct* value_ptr,
+                                                  CallArgs... args)
   {
     value_ptr->invoke(&value_ptr->obj, std::forward<CallArgs>(args)...);
   }
diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp
index d5905f7928..d56c576710 100644
--- a/include/RAJA/pattern/atomic.hpp
+++ b/include/RAJA/pattern/atomic.hpp
@@ -87,9 +87,9 @@ namespace RAJA
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T* acc)
 {
-  return RAJA::atomicLoad(Policy{}, acc);
+  return RAJA::atomicLoad(Policy {}, acc);
 }
 
 
@@ -100,9 +100,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T* acc, T value)
 {
-  RAJA::atomicStore(Policy{}, acc, value);
+  RAJA::atomicStore(Policy {}, acc, value);
 }
 
 
@@ -114,9 +114,9 @@ RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T* acc, T value)
 {
-  return RAJA::atomicAdd(Policy{}, acc, value);
+  return RAJA::atomicAdd(Policy {}, acc, value);
 }
 
 
@@ -128,9 +128,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T* acc, T value)
 {
-  return RAJA::atomicSub(Policy{}, acc, value);
+  return RAJA::atomicSub(Policy {}, acc, value);
 }
 
 
@@ -142,9 +142,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T* acc, T value)
 {
-  return RAJA::atomicMin(Policy{}, acc, value);
+  return RAJA::atomicMin(Policy {}, acc, value);
 }
 
 
@@ -156,9 +156,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T* acc, T value)
 {
-  return RAJA::atomicMax(Policy{}, acc, value);
+  return RAJA::atomicMax(Policy {}, acc, value);
 }
 
 
@@ -169,9 +169,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T* acc)
 {
-  return RAJA::atomicInc(Policy{}, acc);
+  return RAJA::atomicInc(Policy {}, acc);
 }
 
 
@@ -185,9 +185,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T* acc, T compare)
 {
-  return RAJA::atomicInc(Policy{}, acc, compare);
+  return RAJA::atomicInc(Policy {}, acc, compare);
 }
 
 
@@ -198,9 +198,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T* acc)
 {
-  return RAJA::atomicDec(Policy{}, acc);
+  return RAJA::atomicDec(Policy {}, acc);
 }
 
 
@@ -214,9 +214,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T* acc, T compare)
 {
-  return RAJA::atomicDec(Policy{}, acc, compare);
+  return RAJA::atomicDec(Policy {}, acc, compare);
 }
 
 
@@ -229,11 +229,11 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T* acc, T value)
 {
   static_assert(std::is_integral<T>::value,
                 "atomicAnd can only be used on integral types");
-  return RAJA::atomicAnd(Policy{}, acc, value);
+  return RAJA::atomicAnd(Policy {}, acc, value);
 }
 
 
@@ -246,11 +246,11 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T* acc, T value)
 {
   static_assert(std::is_integral<T>::value,
                 "atomicOr can only be used on integral types");
-  return RAJA::atomicOr(Policy{}, acc, value);
+  return RAJA::atomicOr(Policy {}, acc, value);
 }
 
 
@@ -263,11 +263,11 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T* acc, T value)
 {
   static_assert(std::is_integral<T>::value,
                 "atomicXor can only be used on integral types");
-  return RAJA::atomicXor(Policy{}, acc, value);
+  return RAJA::atomicXor(Policy {}, acc, value);
 }
 
 
@@ -279,9 +279,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T* acc, T value)
 {
-  return RAJA::atomicExchange(Policy{}, acc, value);
+  return RAJA::atomicExchange(Policy {}, acc, value);
 }
 
 
@@ -295,9 +295,9 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T* acc, T compare, T value)
 {
-  return RAJA::atomicCAS(Policy{}, acc, compare, value);
+  return RAJA::atomicCAS(Policy {}, acc, compare, value);
 }
 
 /*!
@@ -317,22 +317,18 @@ class AtomicRef
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr explicit AtomicRef(value_type *value_ptr)
-      : m_value_ptr(value_ptr) {}
+  constexpr explicit AtomicRef(value_type* value_ptr) : m_value_ptr(value_ptr)
+  {}
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr AtomicRef(AtomicRef const &c)
-      : m_value_ptr(c.m_value_ptr) {}
+  constexpr AtomicRef(AtomicRef const& c) : m_value_ptr(c.m_value_ptr) {}
 
   AtomicRef& operator=(AtomicRef const&) = delete;
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  value_type * getPointer() const
-  {
-    return m_value_ptr;
-  }
+  value_type* getPointer() const { return m_value_ptr; }
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
@@ -351,17 +347,11 @@ class AtomicRef
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  value_type load() const
-  {
-    return RAJA::atomicLoad<Policy>(m_value_ptr);
-  }
+  value_type load() const { return RAJA::atomicLoad<Policy>(m_value_ptr); }
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  operator value_type() const
-  {
-    return RAJA::atomicLoad<Policy>(m_value_ptr);
-  }
+  operator value_type() const { return RAJA::atomicLoad<Policy>(m_value_ptr); }
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
@@ -382,10 +372,13 @@ class AtomicRef
   bool compare_exchange_strong(value_type& expect, value_type rhs) const
   {
     value_type compare = expect;
-    value_type old = RAJA::atomicCAS<Policy>(m_value_ptr, compare, rhs);
-    if (compare == old) {
+    value_type old     = RAJA::atomicCAS<Policy>(m_value_ptr, compare, rhs);
+    if (compare == old)
+    {
       return true;
-    } else {
+    }
+    else
+    {
       expect = old;
       return false;
     }
@@ -527,7 +520,7 @@ class AtomicRef
   }
 
 private:
-  value_type *m_value_ptr;
+  value_type* m_value_ptr;
 };
 
 
diff --git a/include/RAJA/pattern/detail/algorithm.hpp b/include/RAJA/pattern/detail/algorithm.hpp
index 21d266bd21..0a5521e0e3 100644
--- a/include/RAJA/pattern/detail/algorithm.hpp
+++ b/include/RAJA/pattern/detail/algorithm.hpp
@@ -49,16 +49,17 @@ using ContainerVal =
     camp::decay<decltype(*camp::val<camp::iterator_from<Container>>())>;
 
 template <typename Container>
-using ContainerRef =
-    decltype(*camp::val<camp::iterator_from<Container>>());
+using ContainerRef = decltype(*camp::val<camp::iterator_from<Container>>());
 
 template <typename Container>
 using ContainerDiff =
-    camp::decay<decltype(camp::val<camp::iterator_from<Container>>()-camp::val<camp::iterator_from<Container>>())>;
+    camp::decay<decltype(camp::val<camp::iterator_from<Container>>() -
+                         camp::val<camp::iterator_from<Container>>())>;
 
 template <typename DiffType, typename CountType>
-RAJA_INLINE
-DiffType firstIndex(DiffType n, CountType num_threads, CountType thread_id)
+RAJA_INLINE DiffType firstIndex(DiffType n,
+                                CountType num_threads,
+                                CountType thread_id)
 {
   return (static_cast<size_t>(n) * thread_id) / num_threads;
 }
@@ -70,9 +71,7 @@ DiffType firstIndex(DiffType n, CountType num_threads, CountType thread_id)
     \brief swap values at iterators lhs and rhs
 */
 template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-safe_iter_swap(Iter lhs, Iter rhs)
+RAJA_HOST_DEVICE RAJA_INLINE void safe_iter_swap(Iter lhs, Iter rhs)
 {
 #ifdef RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE
   using camp::safe_swap;
@@ -87,9 +86,7 @@ safe_iter_swap(Iter lhs, Iter rhs)
     \brief returns iterator to next item
 */
 template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-next(Iter it)
+RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it)
 {
   ++it;
   return it;
@@ -99,9 +96,7 @@ next(Iter it)
     \brief returns iterator to next item
 */
 template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-prev(Iter it)
+RAJA_HOST_DEVICE RAJA_INLINE Iter prev(Iter it)
 {
   --it;
   return it;
diff --git a/include/RAJA/pattern/detail/forall.hpp b/include/RAJA/pattern/detail/forall.hpp
index 3bd5d7ecaf..aa9a3ac888 100644
--- a/include/RAJA/pattern/detail/forall.hpp
+++ b/include/RAJA/pattern/detail/forall.hpp
@@ -19,12 +19,12 @@
 #ifndef RAJA_PATTERN_DETAIL_FORALL_HPP
 #define RAJA_PATTERN_DETAIL_FORALL_HPP
 
-#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX) \
-  using std::begin;                                  \
-  using std::end;                                    \
-  using std::distance;                               \
-  auto begin##SUFFIX = begin(CONTAINER);             \
-  auto end##SUFFIX = end(CONTAINER);                 \
+#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX)                           \
+  using std::begin;                                                            \
+  using std::end;                                                              \
+  using std::distance;                                                         \
+  auto begin##SUFFIX    = begin(CONTAINER);                                    \
+  auto end##SUFFIX      = end(CONTAINER);                                      \
   auto distance##SUFFIX = distance(begin##SUFFIX, end##SUFFIX)
 
 #define RAJA_EXTRACT_BED_IT(CONTAINER) RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, _it)
diff --git a/include/RAJA/pattern/detail/multi_reduce.hpp b/include/RAJA/pattern/detail/multi_reduce.hpp
index 884b9aa989..14b655475b 100644
--- a/include/RAJA/pattern/detail/multi_reduce.hpp
+++ b/include/RAJA/pattern/detail/multi_reduce.hpp
@@ -26,32 +26,29 @@
 #include "RAJA/util/RepeatView.hpp"
 
 
-#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA)    \
-  template <typename tuning, typename T>                      \
-  struct MultiReduce##OP_NAME<POL<tuning>, T>                 \
-      : reduce::detail::BaseMultiReduce##OP_NAME<             \
-            DATA<T, RAJA::reduce::OP<T>, tuning>>             \
-  {                                                           \
-    using policy = POL<tuning>;                               \
-    using Base = reduce::detail::BaseMultiReduce##OP_NAME<    \
-        DATA<T, RAJA::reduce::OP<T>, tuning>>;                \
-    using Base::Base;                                         \
-    using typename Base::value_type;                          \
-    using typename Base::reference;                           \
-                                                              \
-    RAJA_SUPPRESS_HD_WARN                                     \
-    RAJA_HOST_DEVICE                                          \
-    reference operator[](size_t bin) const                    \
-    {                                                         \
-      return reference(*this, bin);                           \
-    }                                                         \
+#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA)                     \
+  template <typename tuning, typename T>                                       \
+  struct MultiReduce##OP_NAME<POL<tuning>, T>                                  \
+      : reduce::detail::BaseMultiReduce##OP_NAME<                              \
+            DATA<T, RAJA::reduce::OP<T>, tuning>>                              \
+  {                                                                            \
+    using policy = POL<tuning>;                                                \
+    using Base   = reduce::detail::BaseMultiReduce##OP_NAME<                   \
+        DATA<T, RAJA::reduce::OP<T>, tuning>>;                               \
+    using Base::Base;                                                          \
+    using typename Base::value_type;                                           \
+    using typename Base::reference;                                            \
+                                                                               \
+    RAJA_SUPPRESS_HD_WARN                                                      \
+    RAJA_HOST_DEVICE                                                           \
+    reference operator[](size_t bin) const { return reference(*this, bin); }   \
   };
 
-#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA)            \
-  RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA)        \
+#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA)                             \
+  RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA)                              \
+  RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA)                              \
+  RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA)                              \
+  RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA)                         \
   RAJA_DECLARE_MULTI_REDUCER(BitAnd, and_bit, POL, DATA)
 
 namespace RAJA
@@ -67,32 +64,37 @@ template <typename t_MultiReduceData>
 struct BaseMultiReduce
 {
   using MultiReduceData = t_MultiReduceData;
-  using MultiReduceOp = typename t_MultiReduceData::MultiReduceOp;
-  using value_type = typename t_MultiReduceData::value_type;
+  using MultiReduceOp   = typename t_MultiReduceData::MultiReduceOp;
+  using value_type      = typename t_MultiReduceData::value_type;
 
-  BaseMultiReduce() : BaseMultiReduce{RepeatView<value_type>(MultiReduceOp::identity(), 0)} {}
+  BaseMultiReduce()
+      : BaseMultiReduce {RepeatView<value_type>(MultiReduceOp::identity(), 0)}
+  {}
 
   explicit BaseMultiReduce(size_t num_bins,
                            value_type init_val = MultiReduceOp::identity(),
                            value_type identity = MultiReduceOp::identity())
-      : BaseMultiReduce{RepeatView<value_type>(init_val, num_bins), identity}
-  { }
-
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>,
-                                   concepts::negate<std::is_convertible<Container, size_t>>,
-                                   concepts::negate<std::is_base_of<BaseMultiReduce, Container>>>* = nullptr >
+      : BaseMultiReduce {RepeatView<value_type>(init_val, num_bins), identity}
+  {}
+
+  template <
+      typename Container,
+      concepts::enable_if_t<
+          type_traits::is_range<Container>,
+          concepts::negate<std::is_convertible<Container, size_t>>,
+          concepts::negate<std::is_base_of<BaseMultiReduce, Container>>>* =
+          nullptr>
   explicit BaseMultiReduce(Container const& container,
                            value_type identity = MultiReduceOp::identity())
-      : data{container, identity}
-  { }
+      : data {container, identity}
+  {}
 
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduce(BaseMultiReduce const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduce(BaseMultiReduce &&) = default;
-  BaseMultiReduce &operator=(BaseMultiReduce const&) = delete;
-  BaseMultiReduce &operator=(BaseMultiReduce &&) = delete;
+  BaseMultiReduce(BaseMultiReduce&&)                 = default;
+  BaseMultiReduce& operator=(BaseMultiReduce const&) = delete;
+  BaseMultiReduce& operator=(BaseMultiReduce&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduce() = default;
 
@@ -108,13 +110,14 @@ struct BaseMultiReduce
     reset(RepeatView<value_type>(init_val, num_bins), identity);
   }
 
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr >
+  template <typename Container,
+            concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr>
   void reset(Container const& container,
              value_type identity = MultiReduceOp::identity())
   {
-    for (size_t bin = 0; bin < data.num_bins(); ++bin) {
-      RAJA_UNUSED_VAR(get(bin)); // automatic get() before reset
+    for (size_t bin = 0; bin < data.num_bins(); ++bin)
+    {
+      RAJA_UNUSED_VAR(get(bin));  // automatic get() before reset
     }
     data.reset(container, identity);
   }
@@ -125,7 +128,7 @@ struct BaseMultiReduce
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  BaseMultiReduce const& combine(size_t bin, value_type const &other) const
+  BaseMultiReduce const& combine(size_t bin, value_type const& other) const
   {
     data.combine(bin, other);
     return *this;
@@ -135,16 +138,19 @@ struct BaseMultiReduce
   value_type get(size_t bin) const { return data.get(bin); }
 
   //! Get the calculated reduced value for each bin and store it in container
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr >
+  template <typename Container,
+            concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr>
   void get_all(Container& container) const
   {
     RAJA_EXTRACT_BED_IT(container);
-    if (size_t(distance_it) != data.num_bins()) {
-      RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size than multi reducer");
+    if (size_t(distance_it) != data.num_bins())
+    {
+      RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size "
+                          "than multi reducer");
     }
     size_t bin = 0;
-    for (auto& val : container) {
+    for (auto& val : container)
+    {
       val = data.get(bin);
       ++bin;
     }
@@ -167,17 +173,17 @@ class BaseMultiReduceMin : public BaseMultiReduce<MultiReduceData>
 {
 public:
   using Base = BaseMultiReduce<MultiReduceData>;
-  using typename Base::value_type;
   using Base::Base;
+  using typename Base::value_type;
 
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceMin(BaseMultiReduceMin const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMin(BaseMultiReduceMin &&) = default;
+  BaseMultiReduceMin(BaseMultiReduceMin&&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMin &operator=(BaseMultiReduceMin const&) = delete;
+  BaseMultiReduceMin& operator=(BaseMultiReduceMin const&) = delete;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMin &operator=(BaseMultiReduceMin &&) = delete;
+  BaseMultiReduceMin& operator=(BaseMultiReduceMin&&) = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceMin() = default;
 
@@ -185,8 +191,8 @@ class BaseMultiReduceMin : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceMin const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base), m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -196,10 +202,7 @@ class BaseMultiReduceMin : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceMin const& m_base;
@@ -226,9 +229,9 @@ class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceMax(BaseMultiReduceMax const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMax(BaseMultiReduceMax &&) = default;
-  BaseMultiReduceMax &operator=(BaseMultiReduceMax const&) = delete;
-  BaseMultiReduceMax &operator=(BaseMultiReduceMax &&) = delete;
+  BaseMultiReduceMax(BaseMultiReduceMax&&)                 = default;
+  BaseMultiReduceMax& operator=(BaseMultiReduceMax const&) = delete;
+  BaseMultiReduceMax& operator=(BaseMultiReduceMax&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceMax() = default;
 
@@ -236,8 +239,8 @@ class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceMax const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base), m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -247,10 +250,7 @@ class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceMax const& m_base;
@@ -277,9 +277,9 @@ class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceSum(BaseMultiReduceSum const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceSum(BaseMultiReduceSum &&) = default;
-  BaseMultiReduceSum &operator=(BaseMultiReduceSum const&) = delete;
-  BaseMultiReduceSum &operator=(BaseMultiReduceSum &&) = delete;
+  BaseMultiReduceSum(BaseMultiReduceSum&&)                 = default;
+  BaseMultiReduceSum& operator=(BaseMultiReduceSum const&) = delete;
+  BaseMultiReduceSum& operator=(BaseMultiReduceSum&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceSum() = default;
 
@@ -287,8 +287,8 @@ class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceSum const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base), m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -298,10 +298,7 @@ class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceSum const& m_base;
@@ -328,9 +325,9 @@ class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceBitOr(BaseMultiReduceBitOr const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceBitOr(BaseMultiReduceBitOr &&) = default;
-  BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr const&) = delete;
-  BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr &&) = delete;
+  BaseMultiReduceBitOr(BaseMultiReduceBitOr&&)                 = default;
+  BaseMultiReduceBitOr& operator=(BaseMultiReduceBitOr const&) = delete;
+  BaseMultiReduceBitOr& operator=(BaseMultiReduceBitOr&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceBitOr() = default;
 
@@ -338,8 +335,8 @@ class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceBitOr const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base), m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -349,10 +346,7 @@ class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceBitOr const& m_base;
@@ -379,9 +373,9 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce<MultiReduceData>
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceBitAnd(BaseMultiReduceBitAnd const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceBitAnd(BaseMultiReduceBitAnd &&) = default;
-  BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd const&) = delete;
-  BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd &&) = delete;
+  BaseMultiReduceBitAnd(BaseMultiReduceBitAnd&&)                 = default;
+  BaseMultiReduceBitAnd& operator=(BaseMultiReduceBitAnd const&) = delete;
+  BaseMultiReduceBitAnd& operator=(BaseMultiReduceBitAnd&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceBitAnd() = default;
 
@@ -389,8 +383,8 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceBitAnd const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base), m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -400,10 +394,7 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceBitAnd const& m_base;
diff --git a/include/RAJA/pattern/detail/privatizer.hpp b/include/RAJA/pattern/detail/privatizer.hpp
index 3579027cd3..9ca50f308b 100644
--- a/include/RAJA/pattern/detail/privatizer.hpp
+++ b/include/RAJA/pattern/detail/privatizer.hpp
@@ -30,7 +30,7 @@ class has_privatizer
 private:
   template <typename C>
   static auto Test(void*)
-      -> decltype(camp::val<typename C::privatizer>(), camp::true_type{});
+      -> decltype(camp::val<typename C::privatizer>(), camp::true_type {});
 
   template <typename>
   static camp::false_type Test(...);
@@ -42,12 +42,13 @@ class has_privatizer
 
 static_assert(!has_privatizer<int>::value, "if this fires, abandon all hope");
 
-struct GenericWrapperBase {
-};
+struct GenericWrapperBase
+{};
 
 template <typename T>
-struct Privatizer {
-  using value_type = camp::decay<T>;
+struct Privatizer
+{
+  using value_type     = camp::decay<T>;
   using reference_type = value_type&;
   value_type priv;
   static_assert(!has_privatizer<T>::value,
@@ -58,7 +59,7 @@ struct Privatizer {
                 "a bug");
 
   RAJA_SUPPRESS_HD_WARN
-  RAJA_HOST_DEVICE Privatizer(const T& o) : priv{o} {}
+  RAJA_HOST_DEVICE Privatizer(const T& o) : priv {o} {}
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE reference_type get_priv() { return priv; }
@@ -85,7 +86,7 @@ template <typename T,
           typename std::enable_if<!has_privatizer<T>::value>::type* = nullptr>
 RAJA_HOST_DEVICE auto thread_privatize(const T& item) -> Privatizer<T>
 {
-  return Privatizer<T>{item};
+  return Privatizer<T> {item};
 }
 
 RAJA_SUPPRESS_HD_WARN
@@ -93,7 +94,7 @@ template <typename T,
           typename std::enable_if<has_privatizer<T>::value>::type* = nullptr>
 RAJA_HOST_DEVICE auto thread_privatize(const T& item) -> typename T::privatizer
 {
-  return typename T::privatizer{item};
+  return typename T::privatizer {item};
 }
 
 }  // namespace internal
diff --git a/include/RAJA/pattern/detail/reduce.hpp b/include/RAJA/pattern/detail/reduce.hpp
index 788f3c698d..2f826b590f 100644
--- a/include/RAJA/pattern/detail/reduce.hpp
+++ b/include/RAJA/pattern/detail/reduce.hpp
@@ -21,33 +21,33 @@
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/types.hpp"
 
-#define RAJA_DECLARE_REDUCER(OP, POL, COMBINER)               \
-  template <typename T>                                       \
-  class Reduce##OP<POL, T>                                    \
-      : public reduce::detail::BaseReduce##OP<T, COMBINER>    \
-  {                                                           \
-  public:                                                     \
-    using Base = reduce::detail::BaseReduce##OP<T, COMBINER>; \
-    using Base::Base;                                         \
+#define RAJA_DECLARE_REDUCER(OP, POL, COMBINER)                                \
+  template <typename T>                                                        \
+  class Reduce##OP<POL, T>                                                     \
+      : public reduce::detail::BaseReduce##OP<T, COMBINER>                     \
+  {                                                                            \
+  public:                                                                      \
+    using Base = reduce::detail::BaseReduce##OP<T, COMBINER>;                  \
+    using Base::Base;                                                          \
   };
 
-#define RAJA_DECLARE_INDEX_REDUCER(OP, POL, COMBINER)                    \
-  template <typename T, typename IndexType>                              \
-  class Reduce##OP<POL, T, IndexType>                                    \
-      : public reduce::detail::BaseReduce##OP<T, IndexType, COMBINER>    \
-  {                                                                      \
-  public:                                                                \
-    using Base = reduce::detail::BaseReduce##OP<T, IndexType, COMBINER>; \
-    using Base::Base;                                                    \
+#define RAJA_DECLARE_INDEX_REDUCER(OP, POL, COMBINER)                          \
+  template <typename T, typename IndexType>                                    \
+  class Reduce##OP<POL, T, IndexType>                                          \
+      : public reduce::detail::BaseReduce##OP<T, IndexType, COMBINER>          \
+  {                                                                            \
+  public:                                                                      \
+    using Base = reduce::detail::BaseReduce##OP<T, IndexType, COMBINER>;       \
+    using Base::Base;                                                          \
   };
 
-#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER)       \
-  RAJA_DECLARE_REDUCER(Sum, POL, COMBINER)             \
-  RAJA_DECLARE_REDUCER(Min, POL, COMBINER)             \
-  RAJA_DECLARE_REDUCER(Max, POL, COMBINER)             \
-  RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER)    \
-  RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER)    \
-  RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER)           \
+#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER)                               \
+  RAJA_DECLARE_REDUCER(Sum, POL, COMBINER)                                     \
+  RAJA_DECLARE_REDUCER(Min, POL, COMBINER)                                     \
+  RAJA_DECLARE_REDUCER(Max, POL, COMBINER)                                     \
+  RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER)                            \
+  RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER)                            \
+  RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER)                                   \
   RAJA_DECLARE_REDUCER(BitAnd, POL, COMBINER)
 
 namespace RAJA
@@ -64,14 +64,15 @@ namespace detail
 {
 
 template <typename T, template <typename...> class Op>
-struct op_adapter : private Op<T, T, T> {
+struct op_adapter : private Op<T, T, T>
+{
   using operator_type = Op<T, T, T>;
   RAJA_HOST_DEVICE static constexpr T identity()
   {
     return operator_type::identity();
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val, const T v) const
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val, const T v) const
   {
     val = operator_type::operator()(val, v);
   }
@@ -79,24 +80,24 @@ struct op_adapter : private Op<T, T, T> {
 }  // namespace detail
 
 template <typename T>
-struct sum : detail::op_adapter<T, RAJA::operators::plus> {
-};
+struct sum : detail::op_adapter<T, RAJA::operators::plus>
+{};
 
 template <typename T>
-struct min : detail::op_adapter<T, RAJA::operators::minimum> {
-};
+struct min : detail::op_adapter<T, RAJA::operators::minimum>
+{};
 
 template <typename T>
-struct max : detail::op_adapter<T, RAJA::operators::maximum> {
-};
+struct max : detail::op_adapter<T, RAJA::operators::maximum>
+{};
 
 template <typename T>
-struct or_bit : detail::op_adapter<T, RAJA::operators::bit_or> {
-};
+struct or_bit : detail::op_adapter<T, RAJA::operators::bit_or>
+{};
 
 template <typename T>
-struct and_bit : detail::op_adapter<T, RAJA::operators::bit_and> {
-};
+struct and_bit : detail::op_adapter<T, RAJA::operators::bit_and>
+{};
 
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
@@ -107,7 +108,8 @@ namespace detail
 {
 
 template <typename T, bool = std::is_integral<T>::value>
-struct DefaultLoc {};
+struct DefaultLoc
+{};
 
 template <typename T>
 struct DefaultLoc<T, false>  // any non-integral type
@@ -128,30 +130,39 @@ class ValueLoc
   T val = doing_min ? operators::limits<T>::max() : operators::limits<T>::min();
   IndexType loc = DefaultLoc<IndexType>().value();
 
-#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 || defined(__HIPCC__)
+#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 ||            \
+    defined(__HIPCC__)
   RAJA_HOST_DEVICE constexpr ValueLoc() {}
-  RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const &other) : val{other.val}, loc{other.loc} {}
+  RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const& other)
+      : val {other.val}, loc {other.loc}
+  {}
   RAJA_HOST_DEVICE
-  ValueLoc &operator=(ValueLoc const &other) { val = other.val; loc = other.loc; return *this;}
+  ValueLoc& operator=(ValueLoc const& other)
+  {
+    val = other.val;
+    loc = other.loc;
+    return *this;
+  }
 #else
-  constexpr ValueLoc() = default;
-  constexpr ValueLoc(ValueLoc const &) = default;
-  ValueLoc &operator=(ValueLoc const &) = default;
+  constexpr ValueLoc()                 = default;
+  constexpr ValueLoc(ValueLoc const&)  = default;
+  ValueLoc& operator=(ValueLoc const&) = default;
 #endif
 
-  RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_) : val{val_}, loc{DefaultLoc<IndexType>().value()} {}
-  RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_, IndexType const &loc_)
-      : val{val_}, loc{loc_}
-  {
-  }
+  RAJA_HOST_DEVICE constexpr ValueLoc(T const& val_)
+      : val {val_}, loc {DefaultLoc<IndexType>().value()}
+  {}
+  RAJA_HOST_DEVICE constexpr ValueLoc(T const& val_, IndexType const& loc_)
+      : val {val_}, loc {loc_}
+  {}
 
   RAJA_HOST_DEVICE operator T() const { return val; }
   RAJA_HOST_DEVICE IndexType getLoc() { return loc; }
-  RAJA_HOST_DEVICE bool operator<(ValueLoc const &rhs) const
+  RAJA_HOST_DEVICE bool operator<(ValueLoc const& rhs) const
   {
     return val < rhs.val;
   }
-  RAJA_HOST_DEVICE bool operator>(ValueLoc const &rhs) const
+  RAJA_HOST_DEVICE bool operator>(ValueLoc const& rhs) const
   {
     return val > rhs.val;
   }
@@ -164,14 +175,17 @@ class ValueLoc
 namespace operators
 {
 template <typename T, typename IndexType, bool B>
-struct limits<::RAJA::reduce::detail::ValueLoc<T, IndexType, B>> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr
-  ::RAJA::reduce::detail::ValueLoc<T, IndexType, B> min()
+struct limits<::RAJA::reduce::detail::ValueLoc<T, IndexType, B>>
+{
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail::
+      ValueLoc<T, IndexType, B>
+      min()
   {
     return ::RAJA::reduce::detail::ValueLoc<T, IndexType, B>(limits<T>::min());
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr
-  ::RAJA::reduce::detail::ValueLoc<T, IndexType, B> max()
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail::
+      ValueLoc<T, IndexType, B>
+      max()
   {
     return ::RAJA::reduce::detail::ValueLoc<T, IndexType, B>(limits<T>::max());
   }
@@ -197,50 +211,49 @@ class BaseReduce
   Combiner_t mutable c;
 
 public:
-  using value_type = T;
+  using value_type  = T;
   using reduce_type = Reduce;
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  BaseReduce() : c{T(), Reduce::identity()} {}
+  BaseReduce() : c {T(), Reduce::identity()} {}
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
   BaseReduce(T init_val, T identity_ = Reduce::identity())
-      : c{init_val, identity_}
-  {
-  }
+      : c {init_val, identity_}
+  {}
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
   void reset(T val, T identity_ = Reduce::identity())
   {
-    operator T(); // automatic get() before reset
+    operator T();  // automatic get() before reset
     c.reset(val, identity_);
   }
 
   //! prohibit compiler-generated copy assignment
-  BaseReduce &operator=(const BaseReduce &) = delete;
+  BaseReduce& operator=(const BaseReduce&) = delete;
 
   //! compiler-generated copy constructor
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  BaseReduce(const BaseReduce &copy) : c(copy.c) {}
+  BaseReduce(const BaseReduce& copy) : c(copy.c) {}
 
   //! compiler-generated move constructor
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  BaseReduce(BaseReduce &&copy) : c(std::move(copy.c)) {}
+  BaseReduce(BaseReduce&& copy) : c(std::move(copy.c)) {}
 
   //! compiler-generated move assignment
-  BaseReduce &operator=(BaseReduce &&) = default;
+  BaseReduce& operator=(BaseReduce&&) = default;
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  void combine(T const &other) const { c.combine(other); }
+  void combine(T const& other) const { c.combine(other); }
 
-  T &local() const { return c.local(); }
+  T& local() const { return c.local(); }
 
   //! Get the calculated reduced value
   operator T() const { return c.get(); }
@@ -253,51 +266,50 @@ template <typename T, typename Reduce, typename Derived>
 class BaseCombinable
 {
 protected:
-  BaseCombinable const *parent = nullptr;
+  BaseCombinable const* parent = nullptr;
   T identity;
   T mutable my_data;
 
 public:
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  constexpr BaseCombinable() : identity{T()}, my_data{T()} {}
+  constexpr BaseCombinable() : identity {T()}, my_data {T()} {}
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
   constexpr BaseCombinable(T init_val, T identity_ = T())
-      : identity{identity_}, my_data{init_val}
-  {
-  }
+      : identity {identity_}, my_data {init_val}
+  {}
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
   void reset(T init_val, T identity_)
   {
-    my_data = init_val;
+    my_data  = init_val;
     identity = identity_;
   }
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  constexpr BaseCombinable(BaseCombinable const &other)
-      : parent{other.parent ? other.parent : &other},
-        identity{other.identity},
-        my_data{identity}
-  {
-  }
+  constexpr BaseCombinable(BaseCombinable const& other)
+      : parent {other.parent ? other.parent : &other},
+        identity {other.identity},
+        my_data {identity}
+  {}
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
   ~BaseCombinable()
   {
-    if (parent && my_data != identity) {
+    if (parent && my_data != identity)
+    {
       Reduce()(parent->my_data, my_data);
     }
   }
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  void combine(T const &other) { Reduce{}(my_data, other); }
+  void combine(T const& other) { Reduce {}(my_data, other); }
 
   /*!
    *  \return the calculated reduced value
@@ -307,17 +319,17 @@ class BaseCombinable
   /*!
    *  \return reference to the local value
    */
-  T &local() const { return my_data; }
+  T& local() const { return my_data; }
 
   T get_combined() const { return my_data; }
 
 private:
   // Convenience method for CRTP
-  const Derived &derived() const
+  const Derived& derived() const
   {
-    return *(static_cast<const Derived *>(this));
+    return *(static_cast<const Derived*>(this));
   }
-  Derived &derived() { return *(static_cast<Derived *>(this)); }
+  Derived& derived() { return *(static_cast<Derived*>(this)); }
 };
 
 /*!
@@ -336,7 +348,7 @@ class BaseReduceMin : public BaseReduce<T, RAJA::reduce::min, Combiner>
 
   //! reducer function; updates the current instance's state
   RAJA_HOST_DEVICE
-  const BaseReduceMin &min(T rhs) const
+  const BaseReduceMin& min(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -350,36 +362,43 @@ class BaseReduceMin : public BaseReduce<T, RAJA::reduce::min, Combiner>
  *
  **************************************************************************
  */
-template <typename T, typename IndexType, template <typename, typename> class Combiner>
+template <typename T,
+          typename IndexType,
+          template <typename, typename>
+          class Combiner>
 class BaseReduceMinLoc
     : public BaseReduce<ValueLoc<T, IndexType>, RAJA::reduce::min, Combiner>
 {
 public:
   using Base = BaseReduce<ValueLoc<T, IndexType>, RAJA::reduce::min, Combiner>;
-  using value_type = typename Base::value_type;
+  using value_type  = typename Base::value_type;
   using reduce_type = typename Base::reduce_type;
   using Base::Base;
 
   constexpr BaseReduceMinLoc() : Base(value_type(T(), IndexType())) {}
 
-  constexpr BaseReduceMinLoc(T init_val, IndexType init_idx,
-                             T identity_val_ = reduce_type::identity(),
-                             IndexType identity_loc_ = DefaultLoc<IndexType>().value())
-    : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_))
-  {
-  }
-
-  void reset(T init_val, IndexType init_idx,
-             T identity_val_ = reduce_type::identity(),
+  constexpr BaseReduceMinLoc(
+      T init_val,
+      IndexType init_idx,
+      T identity_val_         = reduce_type::identity(),
+      IndexType identity_loc_ = DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val_, identity_loc_))
+  {}
+
+  void reset(T init_val,
+             IndexType init_idx,
+             T identity_val_         = reduce_type::identity(),
              IndexType identity_loc_ = DefaultLoc<IndexType>().value())
   {
-    operator T(); // automatic get() before reset
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_));
+    operator T();  // automatic get() before reset
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val_, identity_loc_));
   }
 
   /// \brief reducer function; updates the current instance's state
   RAJA_HOST_DEVICE
-  const BaseReduceMinLoc &minloc(T rhs, IndexType loc) const
+  const BaseReduceMinLoc& minloc(T rhs, IndexType loc) const
   {
     this->combine(value_type(rhs, loc));
     return *this;
@@ -408,7 +427,7 @@ class BaseReduceMax : public BaseReduce<T, RAJA::reduce::max, Combiner>
 
   //! reducer function; updates the current instance's state
   RAJA_HOST_DEVICE
-  const BaseReduceMax &max(T rhs) const
+  const BaseReduceMax& max(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -432,7 +451,7 @@ class BaseReduceSum : public BaseReduce<T, RAJA::reduce::sum, Combiner>
   //! reducer function; updates the current instance's state
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  const BaseReduceSum &operator+=(T rhs) const
+  const BaseReduceSum& operator+=(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -456,7 +475,7 @@ class BaseReduceBitOr : public BaseReduce<T, RAJA::reduce::or_bit, Combiner>
   //! reducer function; updates the current instance's state
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  const BaseReduceBitOr &operator|=(T rhs) const
+  const BaseReduceBitOr& operator|=(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -480,7 +499,7 @@ class BaseReduceBitAnd : public BaseReduce<T, RAJA::reduce::and_bit, Combiner>
   //! reducer function; updates the current instance's state
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
-  const BaseReduceBitAnd &operator&=(T rhs) const
+  const BaseReduceBitAnd& operator&=(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -495,36 +514,45 @@ class BaseReduceBitAnd : public BaseReduce<T, RAJA::reduce::and_bit, Combiner>
  *
  **************************************************************************
  */
-template <typename T, typename IndexType, template <typename, typename> class Combiner>
-class BaseReduceMaxLoc
-    : public BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>
+template <typename T,
+          typename IndexType,
+          template <typename, typename>
+          class Combiner>
+class BaseReduceMaxLoc : public BaseReduce<ValueLoc<T, IndexType, false>,
+                                           RAJA::reduce::max,
+                                           Combiner>
 {
 public:
-  using Base = BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>;
-  using value_type = typename Base::value_type;
+  using Base =
+      BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>;
+  using value_type  = typename Base::value_type;
   using reduce_type = typename Base::reduce_type;
   using Base::Base;
 
   constexpr BaseReduceMaxLoc() : Base(value_type(T(), IndexType())) {}
 
-  constexpr BaseReduceMaxLoc(T init_val, IndexType init_idx,
-                             T identity_val_ = reduce_type::identity(),
-                             IndexType identity_loc_ = DefaultLoc<IndexType>().value())
-    : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_))
-  {
-  }
-
-  void reset(T init_val, IndexType init_idx,
-             T identity_val_ = reduce_type::identity(),
+  constexpr BaseReduceMaxLoc(
+      T init_val,
+      IndexType init_idx,
+      T identity_val_         = reduce_type::identity(),
+      IndexType identity_loc_ = DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val_, identity_loc_))
+  {}
+
+  void reset(T init_val,
+             IndexType init_idx,
+             T identity_val_         = reduce_type::identity(),
              IndexType identity_loc_ = DefaultLoc<IndexType>().value())
   {
-    operator T(); // automatic get() before reset
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_));
+    operator T();  // automatic get() before reset
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val_, identity_loc_));
   }
 
   //! reducer function; updates the current instance's state
   RAJA_HOST_DEVICE
-  const BaseReduceMaxLoc &maxloc(T rhs, IndexType loc) const
+  const BaseReduceMaxLoc& maxloc(T rhs, IndexType loc) const
   {
     this->combine(value_type(rhs, loc));
     return *this;
diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp
index 686f0e8c6b..e0b87a5d60 100644
--- a/include/RAJA/pattern/forall.hpp
+++ b/include/RAJA/pattern/forall.hpp
@@ -98,14 +98,15 @@ namespace detail
 {
 /// Adapter to replace specific implementations for the icount variants
 template <typename Range, typename Body, typename IndexT>
-struct icount_adapter {
+struct icount_adapter
+{
   using index_type = typename std::decay<IndexT>::type;
   typename std::decay<Body>::type body;
   using container_type = typename std::decay<Range>::type;
   typename container_type::iterator begin_it;
   Index_type icount;
   icount_adapter(Range const& r, Body const& b, IndexT icount_)
-      : body{b}, icount{icount_}
+      : body {b}, icount {icount_}
   {
     using std::begin;
     begin_it = begin(r);
@@ -119,16 +120,28 @@ struct icount_adapter {
   }
 };
 
-struct CallForall {
-  template <typename T, typename ExecPol, typename Body, typename Res, typename ForallParams>
-  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&, ExecPol, Body, Res, ForallParams) const;
+struct CallForall
+{
+  template <typename T,
+            typename ExecPol,
+            typename Body,
+            typename Res,
+            typename ForallParams>
+  RAJA_INLINE camp::resources::EventProxy<Res>
+  operator()(T const&, ExecPol, Body, Res, ForallParams) const;
 };
 
-struct CallForallIcount {
+struct CallForallIcount
+{
   constexpr CallForallIcount(int s);
 
-  template <typename T, typename ExecPol, typename Body, typename Res, typename ForallParams>
-  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&, ExecPol, Body, Res, ForallParams) const;
+  template <typename T,
+            typename ExecPol,
+            typename Body,
+            typename Res,
+            typename ForallParams>
+  RAJA_INLINE camp::resources::EventProxy<Res>
+  operator()(T const&, ExecPol, Body, Res, ForallParams) const;
 
   const int start;
 };
@@ -152,22 +165,31 @@ namespace wrap
  *
  ******************************************************************************
  */
-template <typename Res, typename ExecutionPolicy, typename Container, typename LoopBody, typename ForallParams>
+template <typename Res,
+          typename ExecutionPolicy,
+          typename Container,
+          typename LoopBody,
+          typename ForallParams>
 RAJA_INLINE concepts::enable_if_t<
     RAJA::resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
     type_traits::is_range<Container>>
-forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallParams&& f_params)
+forall(Res r,
+       ExecutionPolicy&& p,
+       Container&& c,
+       LoopBody&& loop_body,
+       ForallParams&& f_params)
 {
   RAJA_FORCEINLINE_RECURSIVE
-  return forall_impl(r,
-                     std::forward<ExecutionPolicy>(p),
-                     std::forward<Container>(c),
-                     std::forward<LoopBody>(loop_body),
-                     std::forward<ForallParams>(f_params));
+  return forall_impl(
+      r, std::forward<ExecutionPolicy>(p), std::forward<Container>(c),
+      std::forward<LoopBody>(loop_body), std::forward<ForallParams>(f_params));
 }
 
-template <typename Res, typename ExecutionPolicy, typename Container, typename LoopBody>
+template <typename Res,
+          typename ExecutionPolicy,
+          typename Container,
+          typename LoopBody>
 RAJA_INLINE concepts::enable_if_t<
     RAJA::resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -175,11 +197,9 @@ RAJA_INLINE concepts::enable_if_t<
 forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
   RAJA_FORCEINLINE_RECURSIVE
-  return forall_impl(r,
-                     std::forward<ExecutionPolicy>(p),
-                     std::forward<Container>(c),
-                     std::forward<LoopBody>(loop_body),
-                     expt::get_empty_forall_param_pack());
+  return forall_impl(
+      r, std::forward<ExecutionPolicy>(p), std::forward<Container>(c),
+      std::forward<LoopBody>(loop_body), expt::get_empty_forall_param_pack());
 }
 
 
@@ -197,22 +217,22 @@ template <typename Res,
           typename LoopBody,
           typename ForallParams>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(Res r,
-                                                      ExecutionPolicy&& p,
-                                                      Container&& c,
-                                                      IndexType&& icount,
-                                                      LoopBody&& loop_body,
-                                                      ForallParams&& f_params)
+                                                     ExecutionPolicy&& p,
+                                                     Container&& c,
+                                                     IndexType&& icount,
+                                                     LoopBody&& loop_body,
+                                                     ForallParams&& f_params)
 {
   using std::begin;
   using std::distance;
   using std::end;
   auto range = RangeSegment(0, distance(begin(c), end(c)));
-  detail::icount_adapter<Container, LoopBody, IndexType> adapted(c,
-                                                                 loop_body,
+  detail::icount_adapter<Container, LoopBody, IndexType> adapted(c, loop_body,
                                                                  icount);
   using policy::sequential::forall_impl;
   RAJA_FORCEINLINE_RECURSIVE
-  return forall_impl(r, std::forward<ExecutionPolicy>(p), range, adapted, std::forward<ForallParams>(f_params));
+  return forall_impl(r, std::forward<ExecutionPolicy>(p), range, adapted,
+                     std::forward<ForallParams>(f_params));
 }
 
 /*!
@@ -230,23 +250,24 @@ template <typename Res,
           typename... SegmentTypes,
           typename LoopBody,
           typename ForallParams>
-RAJA_INLINE resources::EventProxy<Res> forall_Icount(Res r,
-                                                ExecPolicy<SegmentIterPolicy,
-                                                SegmentExecPolicy>,
-                                                const TypedIndexSet<SegmentTypes...>& iset,
-                                                LoopBody loop_body,
-                                                ForallParams f_params)
+RAJA_INLINE resources::EventProxy<Res>
+forall_Icount(Res r,
+              ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
+              const TypedIndexSet<SegmentTypes...>& iset,
+              LoopBody loop_body,
+              ForallParams f_params)
 {
   // no need for icount variant here
-  auto segIterRes = resources::get_resource<SegmentIterPolicy>::type::get_default();
-  wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) {
-    iset.segmentCall(segID,
+  auto segIterRes =
+      resources::get_resource<SegmentIterPolicy>::type::get_default();
+  wrap::forall(segIterRes, SegmentIterPolicy(), iset,
+               [=, &r](int segID)
+               {
+                 iset.segmentCall(
+                     segID,
                      detail::CallForallIcount(iset.getStartingIcount(segID)),
-                     SegmentExecPolicy(),
-                     loop_body,
-                     r,
-                     f_params);
-  });
+                     SegmentExecPolicy(), loop_body, r, f_params);
+               });
   return RAJA::resources::EventProxy<Res>(r);
 }
 
@@ -256,30 +277,33 @@ template <typename Res,
           typename LoopBody,
           typename... SegmentTypes,
           typename ForallParams>
-RAJA_INLINE resources::EventProxy<Res> forall(Res r,
-                                         ExecPolicy<SegmentIterPolicy,
-                                         SegmentExecPolicy>,
-                                         const TypedIndexSet<SegmentTypes...>& iset,
-                                         LoopBody loop_body,
-                                         ForallParams f_params)
-{
-  auto segIterRes = resources::get_resource<SegmentIterPolicy>::type::get_default();
-  wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) {
-    iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r, f_params);
-  });
+RAJA_INLINE resources::EventProxy<Res>
+forall(Res r,
+       ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
+       const TypedIndexSet<SegmentTypes...>& iset,
+       LoopBody loop_body,
+       ForallParams f_params)
+{
+  auto segIterRes =
+      resources::get_resource<SegmentIterPolicy>::type::get_default();
+  wrap::forall(segIterRes, SegmentIterPolicy(), iset,
+               [=, &r](int segID)
+               {
+                 iset.segmentCall(segID, detail::CallForall {},
+                                  SegmentExecPolicy(), loop_body, r, f_params);
+               });
   return RAJA::resources::EventProxy<Res>(r);
 }
 
 }  // end namespace wrap
 
 
-
 /*!
  ******************************************************************************
  *
- * \brief The RAJA::policy_by_value_interface forall functions provide an interface with
- *        value-based policies. It also enforces the interface and performs
- *        static checks as well as triggering plugins and loop body updates.
+ * \brief The RAJA::policy_by_value_interface forall functions provide an
+ * interface with value-based policies. It also enforces the interface and
+ * performs static checks as well as triggering plugins and loop body updates.
  *
  ******************************************************************************
  */
@@ -294,11 +318,12 @@ inline namespace policy_by_value_interface
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Res, typename IdxSet, typename... Params>
-RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
-                                                     Res r,
-                                                     IdxSet&& c,
-                                                     Params&&... params)
+template <typename ExecutionPolicy,
+          typename Res,
+          typename IdxSet,
+          typename... Params>
+RAJA_INLINE resources::EventProxy<Res>
+forall_Icount(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
 {
   static_assert(type_traits::is_index_set<IdxSet>::value,
                 "Expected a TypedIndexSet but did not get one. Are you using "
@@ -306,9 +331,10 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
 
   auto f_params = expt::make_forall_param_pack(std::forward<Params>(params)...);
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
-  //expt::check_forall_optional_args(loop_body, f_params);
+  // expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context {
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -318,27 +344,24 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
 
   util::callPreLaunchPlugins(context);
 
-  RAJA::resources::EventProxy<Res> e = wrap::forall_Icount(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<IdxSet>(c),
-      std::move(body),
-      f_params);
+  RAJA::resources::EventProxy<Res> e =
+      wrap::forall_Icount(r, std::forward<ExecutionPolicy>(p),
+                          std::forward<IdxSet>(c), std::move(body), f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy, typename IdxSet, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
-RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
-                                                     IdxSet&& c,
-                                                     LoopBody&& loop_body)
+template <
+    typename ExecutionPolicy,
+    typename IdxSet,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
+RAJA_INLINE resources::EventProxy<Res>
+forall_Icount(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall_Icount(
-      std::forward<ExecutionPolicy>(p),
-      r,
-      std::forward<IdxSet>(c),
+      std::forward<ExecutionPolicy>(p), r, std::forward<IdxSet>(c),
       std::forward<LoopBody>(loop_body));
 }
 
@@ -349,11 +372,14 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Res, typename IdxSet, typename... Params>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<Res>,
-    type_traits::is_indexset_policy<ExecutionPolicy>>
-forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
+template <typename ExecutionPolicy,
+          typename Res,
+          typename IdxSet,
+          typename... Params>
+RAJA_INLINE
+    concepts::enable_if_t<resources::EventProxy<Res>,
+                          type_traits::is_indexset_policy<ExecutionPolicy>>
+    forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
 {
   static_assert(type_traits::is_index_set<IdxSet>::value,
                 "Expected a TypedIndexSet but did not get one. Are you using "
@@ -363,7 +389,8 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
   expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context {
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -373,28 +400,26 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
 
   util::callPreLaunchPlugins(context);
 
-  resources::EventProxy<Res> e = wrap::forall(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<IdxSet>(c),
-      std::move(body),
-      f_params);
+  resources::EventProxy<Res> e =
+      wrap::forall(r, std::forward<ExecutionPolicy>(p), std::forward<IdxSet>(c),
+                   std::move(body), f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy, typename IdxSet, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<Res>,
-    type_traits::is_indexset_policy<ExecutionPolicy>>
-forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
+template <
+    typename ExecutionPolicy,
+    typename IdxSet,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
+RAJA_INLINE
+    concepts::enable_if_t<resources::EventProxy<Res>,
+                          type_traits::is_indexset_policy<ExecutionPolicy>>
+    forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall(
-      std::forward<ExecutionPolicy>(p),
-      r,
-      std::forward<IdxSet>(c),
+      std::forward<ExecutionPolicy>(p), r, std::forward<IdxSet>(c),
       std::forward<LoopBody>(loop_body));
 }
 
@@ -405,12 +430,14 @@ forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Container, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<Res>,
-    type_traits::is_multi_policy<ExecutionPolicy>,
-    type_traits::is_range<Container>>
+template <
+    typename ExecutionPolicy,
+    typename Container,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_multi_policy<ExecutionPolicy>,
+                                  type_traits::is_range<Container>>
 forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
   static_assert(type_traits::is_random_access_range<Container>::value,
@@ -419,10 +446,9 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
   auto r = Res::get_default();
 
   // plugins handled in multipolicy policy_invoker
-  return forall_impl(r,
-              std::forward<ExecutionPolicy>(p),
-              std::forward<Container>(c),
-              std::forward<LoopBody>(loop_body));
+  return forall_impl(r, std::forward<ExecutionPolicy>(p),
+                     std::forward<Container>(c),
+                     std::forward<LoopBody>(loop_body));
 }
 
 /*!
@@ -438,10 +464,9 @@ template <typename ExecutionPolicy,
           typename IndexType,
           typename FirstParam,
           typename... Params>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<Res>,
-    type_traits::is_range<Container>,
-    type_traits::is_integral<IndexType>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_range<Container>,
+                                  type_traits::is_integral<IndexType>>
 forall_Icount(ExecutionPolicy&& p,
               Res r,
               Container&& c,
@@ -452,11 +477,14 @@ forall_Icount(ExecutionPolicy&& p,
   static_assert(type_traits::is_random_access_range<Container>::value,
                 "Container does not model RandomAccessIterator");
 
-  auto f_params = expt::make_forall_param_pack(std::forward<FirstParam>(first), std::forward<Params>(params)...);
-  auto&& loop_body = expt::get_lambda(std::forward<FirstParam>(first), std::forward<Params>(params)...);
-  //expt::check_forall_optional_args(loop_body, f_params);
+  auto f_params = expt::make_forall_param_pack(std::forward<FirstParam>(first),
+                                               std::forward<Params>(params)...);
+  auto&& loop_body = expt::get_lambda(std::forward<FirstParam>(first),
+                                      std::forward<Params>(params)...);
+  // expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context {
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -467,21 +495,18 @@ forall_Icount(ExecutionPolicy&& p,
   util::callPreLaunchPlugins(context);
 
   resources::EventProxy<Res> e = wrap::forall_Icount(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<Container>(c),
-      icount,
-      std::move(body),
-      f_params);
+      r, std::forward<ExecutionPolicy>(p), std::forward<Container>(c), icount,
+      std::move(body), f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy,
-          typename Container,
-          typename IndexType,
-          typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename Container,
+    typename IndexType,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     type_traits::is_range<Container>,
@@ -494,10 +519,7 @@ forall_Icount(ExecutionPolicy&& p,
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall_Icount(
-      std::forward<ExecutionPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      icount,
+      std::forward<ExecutionPolicy>(p), r, std::forward<Container>(c), icount,
       std::forward<LoopBody>(loop_body));
 }
 
@@ -509,7 +531,10 @@ forall_Icount(ExecutionPolicy&& p,
  ******************************************************************************
  */
 
-template <typename ExecutionPolicy, typename Res, typename Container, typename... Params>
+template <typename ExecutionPolicy,
+          typename Res,
+          typename Container,
+          typename... Params>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -524,7 +549,8 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params)
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
   expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context {
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -534,19 +560,19 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params)
 
   util::callPreLaunchPlugins(context);
 
-  resources::EventProxy<Res> e =  wrap::forall(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<Container>(c),
-      std::move(body),
-      f_params);
+  resources::EventProxy<Res> e =
+      wrap::forall(r, std::forward<ExecutionPolicy>(p),
+                   std::forward<Container>(c), std::move(body), f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
 
-template <typename ExecutionPolicy, typename Container, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename Container,
+    typename LoopBody,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -556,13 +582,11 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall(
-      std::forward<ExecutionPolicy>(p),
-      r,
-      std::forward<Container>(c),
+      std::forward<ExecutionPolicy>(p), r, std::forward<Container>(c),
       std::forward<LoopBody>(loop_body));
 }
 
-}  // end inline namespace policy_by_value_interface
+}  // namespace policy_by_value_interface
 
 
 /*!
@@ -570,20 +594,23 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecutionPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename... Args,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE resources::EventProxy<Res> forall(Args&&... args)
 {
   Res r = Res::get_default();
-  return ::RAJA::policy_by_value_interface::forall(
-      ExecutionPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::forall(ExecutionPolicy(), r,
+                                                   std::forward<Args>(args)...);
 }
 template <typename ExecutionPolicy, typename Res, typename... Args>
-RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>, type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_resource<Res>>
 forall(Res r, Args&&... args)
 {
-  return ::RAJA::policy_by_value_interface::forall(
-      ExecutionPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::forall(ExecutionPolicy(), r,
+                                                   std::forward<Args>(args)...);
 }
 
 /*!
@@ -592,8 +619,10 @@ forall(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecutionPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template <
+    typename ExecutionPolicy,
+    typename... Args,
+    typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(Args&&... args)
 {
   Res r = Res::get_default();
@@ -601,7 +630,8 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(Args&&... args)
       ExecutionPolicy(), r, std::forward<Args>(args)...);
 }
 template <typename ExecutionPolicy, typename Res, typename... Args>
-RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>, type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_resource<Res>>
 forall_Icount(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::forall_Icount(
@@ -611,12 +641,17 @@ forall_Icount(Res r, Args&&... args)
 namespace detail
 {
 
-template <typename T, typename ExecutionPolicy, typename LoopBody, typename Res, typename ForallParams>
-RAJA_INLINE camp::resources::EventProxy<Res> CallForall::operator()(T const& segment,
-                                                               ExecutionPolicy,
-                                                               LoopBody body,
-                                                               Res r,
-                                                               ForallParams f_params) const
+template <typename T,
+          typename ExecutionPolicy,
+          typename LoopBody,
+          typename Res,
+          typename ForallParams>
+RAJA_INLINE camp::resources::EventProxy<Res>
+CallForall::operator()(T const& segment,
+                       ExecutionPolicy,
+                       LoopBody body,
+                       Res r,
+                       ForallParams f_params) const
 {
   // this is only called inside a region, use impl
   using policy::sequential::forall_impl;
@@ -626,15 +661,21 @@ RAJA_INLINE camp::resources::EventProxy<Res> CallForall::operator()(T const& seg
 
 constexpr CallForallIcount::CallForallIcount(int s) : start(s) {}
 
-template <typename T, typename ExecutionPolicy, typename LoopBody, typename Res, typename ForallParams>
-RAJA_INLINE camp::resources::EventProxy<Res> CallForallIcount::operator()(T const& segment,
-                                                                     ExecutionPolicy,
-                                                                     LoopBody body,
-                                                                     Res r,
-                                                                     ForallParams f_params) const
+template <typename T,
+          typename ExecutionPolicy,
+          typename LoopBody,
+          typename Res,
+          typename ForallParams>
+RAJA_INLINE camp::resources::EventProxy<Res>
+CallForallIcount::operator()(T const& segment,
+                             ExecutionPolicy,
+                             LoopBody body,
+                             Res r,
+                             ForallParams f_params) const
 {
   // go through wrap to unwrap icount
-  return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body, f_params);
+  return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body,
+                             f_params);
 }
 
 }  // namespace detail
@@ -650,100 +691,112 @@ RAJA_INLINE camp::resources::EventProxy<Res> CallForallIcount::operator()(T cons
 namespace expt
 {
 
-  template<camp::idx_t IDX, typename POLICY_LIST>
-  struct dynamic_helper
+template <camp::idx_t IDX, typename POLICY_LIST>
+struct dynamic_helper
+{
+  template <typename SEGMENT, typename BODY>
+  static void invoke_forall(const int pol, SEGMENT const& seg, BODY const& body)
   {
-    template<typename SEGMENT, typename BODY>
-    static void invoke_forall(const int pol, SEGMENT const &seg, BODY const &body)
+    if (IDX == pol)
     {
-      if(IDX==pol){
-        using t_pol = typename camp::at<POLICY_LIST,camp::num<IDX>>::type;
-        RAJA::forall<t_pol>(seg, body);
-        return;
-      }
-      dynamic_helper<IDX-1, POLICY_LIST>::invoke_forall(pol, seg, body);
+      using t_pol = typename camp::at<POLICY_LIST, camp::num<IDX>>::type;
+      RAJA::forall<t_pol>(seg, body);
+      return;
     }
+    dynamic_helper<IDX - 1, POLICY_LIST>::invoke_forall(pol, seg, body);
+  }
 
-    template<typename SEGMENT, typename BODY>
-    static resources::EventProxy<resources::Resource>
-    invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body)
-    {
-
-      using t_pol = typename camp::at<POLICY_LIST,camp::num<IDX>>::type;
-      using resource_type = typename resources::get_resource<t_pol>::type;
+  template <typename SEGMENT, typename BODY>
+  static resources::EventProxy<resources::Resource>
+  invoke_forall(RAJA::resources::Resource r,
+                const int pol,
+                SEGMENT const& seg,
+                BODY const& body)
+  {
 
-      if(IDX==pol){
-        RAJA::forall<t_pol>(r.get<resource_type>(), seg, body);
+    using t_pol         = typename camp::at<POLICY_LIST, camp::num<IDX>>::type;
+    using resource_type = typename resources::get_resource<t_pol>::type;
 
-        //Return a generic event proxy from r,
-        //because forall returns a typed event proxy
-        return {r};
-      }
+    if (IDX == pol)
+    {
+      RAJA::forall<t_pol>(r.get<resource_type>(), seg, body);
 
-      return dynamic_helper<IDX-1, POLICY_LIST>::invoke_forall(r, pol, seg, body);
+      // Return a generic event proxy from r,
+      // because forall returns a typed event proxy
+      return {r};
     }
 
-  };
+    return dynamic_helper<IDX - 1, POLICY_LIST>::invoke_forall(r, pol, seg,
+                                                               body);
+  }
+};
 
-  template<typename POLICY_LIST>
-  struct dynamic_helper<0, POLICY_LIST>
+template <typename POLICY_LIST>
+struct dynamic_helper<0, POLICY_LIST>
+{
+  template <typename SEGMENT, typename BODY>
+  static void invoke_forall(const int pol, SEGMENT const& seg, BODY const& body)
   {
-    template<typename SEGMENT, typename BODY>
-    static void
-    invoke_forall(const int pol, SEGMENT const &seg, BODY const &body)
+    if (0 == pol)
     {
-      if(0==pol){
-        using t_pol = typename camp::at<POLICY_LIST,camp::num<0>>::type;
-        RAJA::forall<t_pol>(seg, body);
-        return;
-      }
-      RAJA_ABORT_OR_THROW("Policy enum not supported ");
+      using t_pol = typename camp::at<POLICY_LIST, camp::num<0>>::type;
+      RAJA::forall<t_pol>(seg, body);
+      return;
     }
+    RAJA_ABORT_OR_THROW("Policy enum not supported ");
+  }
 
-    template<typename SEGMENT, typename BODY>
-    static resources::EventProxy<resources::Resource>
-    invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body)
-    {
-      if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range ");
+  template <typename SEGMENT, typename BODY>
+  static resources::EventProxy<resources::Resource>
+  invoke_forall(RAJA::resources::Resource r,
+                const int pol,
+                SEGMENT const& seg,
+                BODY const& body)
+  {
+    if (pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range ");
 
-      using t_pol = typename camp::at<POLICY_LIST,camp::num<0>>::type;
-      using resource_type = typename resources::get_resource<t_pol>::type;
+    using t_pol         = typename camp::at<POLICY_LIST, camp::num<0>>::type;
+    using resource_type = typename resources::get_resource<t_pol>::type;
 
-      RAJA::forall<t_pol>(r.get<resource_type>(), seg, body);
+    RAJA::forall<t_pol>(r.get<resource_type>(), seg, body);
 
-      //Return a generic event proxy from r,
-      //because forall returns a typed event proxy
-      return {r};
-    }
+    // Return a generic event proxy from r,
+    // because forall returns a typed event proxy
+    return {r};
+  }
+};
 
-  };
+template <typename POLICY_LIST, typename SEGMENT, typename BODY>
+void dynamic_forall(const int pol, SEGMENT const& seg, BODY const& body)
+{
+  constexpr int N = camp::size<POLICY_LIST>::value;
+  static_assert(N > 0, "RAJA policy list must not be empty");
 
-  template<typename POLICY_LIST, typename SEGMENT, typename BODY>
-  void dynamic_forall(const int pol, SEGMENT const &seg, BODY const &body)
+  if (pol > N - 1)
   {
-    constexpr int N = camp::size<POLICY_LIST>::value;
-    static_assert(N > 0, "RAJA policy list must not be empty");
-
-    if(pol > N-1)  {
-      RAJA_ABORT_OR_THROW("Policy enum not supported");
-    }
-    dynamic_helper<N-1, POLICY_LIST>::invoke_forall(pol, seg, body);
+    RAJA_ABORT_OR_THROW("Policy enum not supported");
   }
+  dynamic_helper<N - 1, POLICY_LIST>::invoke_forall(pol, seg, body);
+}
 
-  template<typename POLICY_LIST, typename SEGMENT, typename BODY>
-  resources::EventProxy<resources::Resource>
-  dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body)
-  {
-    constexpr int N = camp::size<POLICY_LIST>::value;
-    static_assert(N > 0, "RAJA policy list must not be empty");
-
-    if(pol > N-1)  {
-      RAJA_ABORT_OR_THROW("Policy value out of range");
-    }
+template <typename POLICY_LIST, typename SEGMENT, typename BODY>
+resources::EventProxy<resources::Resource>
+dynamic_forall(RAJA::resources::Resource r,
+               const int pol,
+               SEGMENT const& seg,
+               BODY const& body)
+{
+  constexpr int N = camp::size<POLICY_LIST>::value;
+  static_assert(N > 0, "RAJA policy list must not be empty");
 
-    return dynamic_helper<N-1, POLICY_LIST>::invoke_forall(r, pol, seg, body);
+  if (pol > N - 1)
+  {
+    RAJA_ABORT_OR_THROW("Policy value out of range");
   }
 
+  return dynamic_helper<N - 1, POLICY_LIST>::invoke_forall(r, pol, seg, body);
+}
+
 }  // namespace expt
 
 
diff --git a/include/RAJA/pattern/kernel.hpp b/include/RAJA/pattern/kernel.hpp
index 1875fe27d9..d03c8f531f 100644
--- a/include/RAJA/pattern/kernel.hpp
+++ b/include/RAJA/pattern/kernel.hpp
@@ -55,44 +55,43 @@ template <typename T>
 struct IterableWrapperTuple;
 
 template <typename... Ts>
-struct IterableWrapperTuple<camp::tuple<Ts...>> {
+struct IterableWrapperTuple<camp::tuple<Ts...>>
+{
 
-  using type =
-      camp::tuple<RAJA::Span<typename camp::decay<Ts>::iterator,
-                             typename camp::decay<Ts>::IndexType>...>;
+  using type = camp::tuple<RAJA::Span<typename camp::decay<Ts>::iterator,
+                                      typename camp::decay<Ts>::IndexType>...>;
 };
 
 
 namespace internal
 {
 template <class Tuple, camp::idx_t... I>
-RAJA_INLINE constexpr auto make_wrapped_tuple_impl(Tuple &&t,
-                                                   camp::idx_seq<I...>)
-    -> camp::tuple<RAJA::Span<
-        typename camp::decay<
-            camp::tuple_element_t<I, camp::decay<Tuple>>>::iterator,
-        typename camp::decay<
-            camp::tuple_element_t<I, camp::decay<Tuple>>>::IndexType>...>
+RAJA_INLINE constexpr auto
+make_wrapped_tuple_impl(Tuple&& t, camp::idx_seq<I...>) -> camp::tuple<
+    RAJA::Span<typename camp::decay<
+                   camp::tuple_element_t<I, camp::decay<Tuple>>>::iterator,
+               typename camp::decay<
+                   camp::tuple_element_t<I, camp::decay<Tuple>>>::IndexType>...>
 {
   return camp::make_tuple(
-      RAJA::Span<
-          typename camp::decay<
-              camp::tuple_element_t<I, camp::decay<Tuple>>>::iterator,
-          typename camp::decay<camp::tuple_element_t<I, camp::decay<Tuple>>>::
-              IndexType>{camp::get<I>(std::forward<Tuple>(t)).begin(),
-                         camp::get<I>(std::forward<Tuple>(t)).end()}...);
+      RAJA::Span<typename camp::decay<
+                     camp::tuple_element_t<I, camp::decay<Tuple>>>::iterator,
+                 typename camp::decay<
+                     camp::tuple_element_t<I, camp::decay<Tuple>>>::IndexType> {
+          camp::get<I>(std::forward<Tuple>(t)).begin(),
+          camp::get<I>(std::forward<Tuple>(t)).end()}...);
 }
 }  // namespace internal
 
 template <class Tuple>
-RAJA_INLINE constexpr auto make_wrapped_tuple(Tuple &&t)
+RAJA_INLINE constexpr auto make_wrapped_tuple(Tuple&& t)
     -> decltype(internal::make_wrapped_tuple_impl(
         std::forward<Tuple>(t),
-        camp::make_idx_seq_t<camp::tuple_size<camp::decay<Tuple>>::value>{}))
+        camp::make_idx_seq_t<camp::tuple_size<camp::decay<Tuple>>::value> {}))
 {
   return internal::make_wrapped_tuple_impl(
       std::forward<Tuple>(t),
-      camp::make_idx_seq_t<camp::tuple_size<camp::decay<Tuple>>::value>{});
+      camp::make_idx_seq_t<camp::tuple_size<camp::decay<Tuple>>::value> {});
 }
 
 
@@ -101,12 +100,13 @@ template <typename PolicyType,
           typename ParamTuple,
           typename Resource,
           typename... Bodies>
-RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &&segments,
-                                                                  ParamTuple &&params,
-                                                                  Resource resource,
-                                                                  Bodies &&... bodies)
+RAJA_INLINE resources::EventProxy<Resource>
+kernel_param_resource(SegmentTuple&& segments,
+                      ParamTuple&& params,
+                      Resource resource,
+                      Bodies&&... bodies)
 {
-  util::PluginContext context{util::make_context<PolicyType>()};
+  util::PluginContext context {util::make_context<PolicyType>()};
 
   // TODO: test that all policy members model the Executor policy concept
   // TODO: add a static_assert for functors which cannot be invoked with
@@ -119,10 +119,8 @@ RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &
 
   using param_tuple_t = camp::decay<ParamTuple>;
 
-  using loop_data_t = internal::LoopData<segment_tuple_t,
-                                         param_tuple_t,
-                                         Resource,
-                                         camp::decay<Bodies>...>;
+  using loop_data_t = internal::LoopData<segment_tuple_t, param_tuple_t,
+                                         Resource, camp::decay<Bodies>...>;
 
 
   util::callPreCapturePlugins(context);
@@ -131,11 +129,10 @@ RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &
   // our segments, loop bodies, and the tuple of loop indices
   // it is passed through all of the kernel mechanics by-referenece,
   // and only copied to provide thread-private instances.
-  loop_data_t loop_data(make_wrapped_tuple(
-                            std::forward<SegmentTuple>(segments)),
-                            std::forward<ParamTuple>(params),
-                            resource,
-                            std::forward<Bodies>(bodies)...);
+  loop_data_t loop_data(
+      make_wrapped_tuple(std::forward<SegmentTuple>(segments)),
+      std::forward<ParamTuple>(params), resource,
+      std::forward<Bodies>(bodies)...);
 
   util::callPostCapturePlugins(context);
 
@@ -156,40 +153,35 @@ template <typename PolicyType,
           typename SegmentTuple,
           typename Resource,
           typename... Bodies>
-RAJA_INLINE resources::EventProxy<Resource> kernel_resource(SegmentTuple &&segments,
-                                                            Resource resource,
-                                                            Bodies &&... bodies)
+RAJA_INLINE resources::EventProxy<Resource>
+kernel_resource(SegmentTuple&& segments, Resource resource, Bodies&&... bodies)
 {
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 RAJA::make_tuple(),
-                                                 resource,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments), RAJA::make_tuple(), resource,
+      std::forward<Bodies>(bodies)...);
 }
 
 template <typename PolicyType,
           typename SegmentTuple,
           typename ParamTuple,
           typename... Bodies>
-RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>> kernel_param(SegmentTuple &&segments,
-                                                                                           ParamTuple &&params,
-                                                                                           Bodies &&... bodies)
+RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>>
+kernel_param(SegmentTuple&& segments, ParamTuple&& params, Bodies&&... bodies)
 {
   auto res = resources::get_default_resource<PolicyType>();
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 std::forward<ParamTuple>(params),
-                                                 res,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments), std::forward<ParamTuple>(params),
+      res, std::forward<Bodies>(bodies)...);
 }
 
 template <typename PolicyType, typename SegmentTuple, typename... Bodies>
-RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>> kernel(SegmentTuple &&segments,
-                                                                                     Bodies &&... bodies)
+RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>>
+kernel(SegmentTuple&& segments, Bodies&&... bodies)
 {
   auto res = resources::get_default_resource<PolicyType>();
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 RAJA::make_tuple(),
-                                                 res,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments), RAJA::make_tuple(), res,
+      std::forward<Bodies>(bodies)...);
 }
 
 
diff --git a/include/RAJA/pattern/kernel/Collapse.hpp b/include/RAJA/pattern/kernel/Collapse.hpp
index 8efb126397..10afccda53 100644
--- a/include/RAJA/pattern/kernel/Collapse.hpp
+++ b/include/RAJA/pattern/kernel/Collapse.hpp
@@ -29,8 +29,8 @@ namespace statement
 template <typename ExecPolicy, typename ForList, typename... EnclosedStmts>
 struct Collapse : public internal::ForList,
                   public internal::CollapseBase,
-                  public internal::Statement<ExecPolicy, EnclosedStmts...> {
-};
+                  public internal::Statement<ExecPolicy, EnclosedStmts...>
+{};
 
 
 }  // namespace statement
diff --git a/include/RAJA/pattern/kernel/Conditional.hpp b/include/RAJA/pattern/kernel/Conditional.hpp
index 6b7875c4c2..1b8f38f76b 100644
--- a/include/RAJA/pattern/kernel/Conditional.hpp
+++ b/include/RAJA/pattern/kernel/Conditional.hpp
@@ -37,8 +37,8 @@ namespace statement
  *
  */
 template <typename Condition, typename... EnclosedStmts>
-struct If : public internal::Statement<camp::nil, EnclosedStmts...> {
-};
+struct If : public internal::Statement<camp::nil, EnclosedStmts...>
+{};
 
 
 /*!
@@ -46,10 +46,11 @@ struct If : public internal::Statement<camp::nil, EnclosedStmts...> {
  *
  */
 template <long value>
-struct Value {
+struct Value
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static long eval(Data const &)
+  RAJA_HOST_DEVICE RAJA_INLINE static long eval(Data const&)
   {
     return value;
   }
@@ -60,10 +61,11 @@ struct Value {
  *
  */
 template <typename L, typename R>
-struct Equals {
+struct Equals
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) == R::eval(data);
   }
@@ -74,10 +76,11 @@ struct Equals {
  *
  */
 template <typename L, typename R>
-struct NotEquals {
+struct NotEquals
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) != R::eval(data);
   }
@@ -89,10 +92,11 @@ struct NotEquals {
  *
  */
 template <typename L, typename R>
-struct Or {
+struct Or
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) || R::eval(data);
   }
@@ -104,10 +108,11 @@ struct Or {
  *
  */
 template <typename L, typename R>
-struct And {
+struct And
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) && R::eval(data);
   }
@@ -119,10 +124,11 @@ struct And {
  *
  */
 template <typename L, typename R>
-struct LessThan {
+struct LessThan
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) < R::eval(data);
   }
@@ -134,10 +140,11 @@ struct LessThan {
  *
  */
 template <typename L, typename R>
-struct LessThanEq {
+struct LessThanEq
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) <= R::eval(data);
   }
@@ -149,10 +156,11 @@ struct LessThanEq {
  *
  */
 template <typename L, typename R>
-struct GreaterThan {
+struct GreaterThan
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) > R::eval(data);
   }
@@ -164,10 +172,11 @@ struct GreaterThan {
  *
  */
 template <typename L, typename R>
-struct GreaterThanEq {
+struct GreaterThanEq
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) >= R::eval(data);
   }
@@ -179,10 +188,11 @@ struct GreaterThanEq {
  *
  */
 template <typename L>
-struct Not {
+struct Not
+{
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return !(L::eval(data));
   }
@@ -196,14 +206,16 @@ namespace internal
 
 
 template <typename Condition, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::If<Condition, EnclosedStmts...>, Types> {
+struct StatementExecutor<statement::If<Condition, EnclosedStmts...>, Types>
+{
 
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
 
-    if (Condition::eval(data)) {
+    if (Condition::eval(data))
+    {
       execute_statement_list<camp::list<EnclosedStmts...>, Types>(
           std::forward<Data>(data));
     }
diff --git a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp
index 539c451673..661fe92868 100644
--- a/include/RAJA/pattern/kernel/For.hpp
+++ b/include/RAJA/pattern/kernel/For.hpp
@@ -42,7 +42,8 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts>
 struct For : public internal::ForList,
              public internal::ForTraitBase<ArgumentId, ExecPolicy>,
-             public internal::Statement<ExecPolicy, EnclosedStmts...> {
+             public internal::Statement<ExecPolicy, EnclosedStmts...>
+{
 
   // TODO: add static_assert for valid policy in Pol
   using execution_policy_t = ExecPolicy;
@@ -59,8 +60,12 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  *
  */
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
-struct ForWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+template <camp::idx_t ArgumentId,
+          typename Data,
+          typename Types,
+          typename... EnclosedStmts>
+struct ForWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
+{
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
@@ -85,11 +90,13 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::For<ArgumentId, ExecPolicy, EnclosedStmts...>, Types> {
+    statement::For<ArgumentId, ExecPolicy, EnclosedStmts...>,
+    Types>
+{
 
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
@@ -98,12 +105,13 @@ struct StatementExecutor<
     // Create a wrapper, just in case forall_impl needs to thread_privatize
     ForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...> for_wrapper(data);
 
-    auto len = segment_length<ArgumentId>(data);
+    auto len    = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     auto r = data.res;
 
-    forall_impl(r, ExecPolicy{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, ExecPolicy {}, TypedRangeSegment<len_t>(0, len), for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
@@ -112,15 +120,14 @@ struct StatementExecutor<
  *
  *
  */
-template <camp::idx_t ArgumentId,
-          typename... EnclosedStmts,
-          typename Types>
-struct StatementExecutor<
-    statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types> {
+template <camp::idx_t ArgumentId, typename... EnclosedStmts, typename Types>
+struct StatementExecutor<statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+                         Types>
+{
 
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
@@ -129,12 +136,13 @@ struct StatementExecutor<
     // Create a wrapper, just in case forall_impl needs to thread_privatize
     ForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...> for_wrapper(data);
 
-    auto len = segment_length<ArgumentId>(data);
+    auto len    = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     RAJA_EXTRACT_BED_IT(TypedRangeSegment<len_t>(0, len));
 
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
+    {
       for_wrapper(*(begin_it + i));
     }
   }
diff --git a/include/RAJA/pattern/kernel/ForICount.hpp b/include/RAJA/pattern/kernel/ForICount.hpp
index 18515c7f59..c6e75c35aa 100644
--- a/include/RAJA/pattern/kernel/ForICount.hpp
+++ b/include/RAJA/pattern/kernel/ForICount.hpp
@@ -44,8 +44,9 @@ template <camp::idx_t ArgumentId,
           typename ExecPolicy = camp::nil,
           typename... EnclosedStmts>
 struct ForICount : public internal::ForList,
-             public internal::ForTraitBase<ArgumentId, ExecPolicy>,
-             public internal::Statement<ExecPolicy, EnclosedStmts...> {
+                   public internal::ForTraitBase<ArgumentId, ExecPolicy>,
+                   public internal::Statement<ExecPolicy, EnclosedStmts...>
+{
 
   static_assert(std::is_base_of<internal::ParamBase, ParamId>::value,
                 "Inappropriate ParamId, ParamId must be of type "
@@ -64,9 +65,13 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId, typename Data, typename Types,
+template <camp::idx_t ArgumentId,
+          typename ParamId,
+          typename Data,
+          typename Types,
           typename... EnclosedStmts>
-struct ForICountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+struct ForICountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
+{
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
@@ -93,26 +98,29 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::ForICount<ArgumentId, ParamId, ExecPolicy, EnclosedStmts...>, Types> {
+    statement::ForICount<ArgumentId, ParamId, ExecPolicy, EnclosedStmts...>,
+    Types>
+{
 
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
     // Create a wrapper, just in case forall_impl needs to thread_privatize
-    ForICountWrapper<ArgumentId, ParamId, Data, NewTypes,
-                     EnclosedStmts...> for_wrapper(data);
+    ForICountWrapper<ArgumentId, ParamId, Data, NewTypes, EnclosedStmts...>
+        for_wrapper(data);
 
-    auto len = segment_length<ArgumentId>(data);
+    auto len    = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     auto r = resources::get_resource<ExecPolicy>::type::get_default();
 
-    forall_impl(r, ExecPolicy{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, ExecPolicy {}, TypedRangeSegment<len_t>(0, len), for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
diff --git a/include/RAJA/pattern/kernel/Hyperplane.hpp b/include/RAJA/pattern/kernel/Hyperplane.hpp
index 955afcecc0..66be036556 100644
--- a/include/RAJA/pattern/kernel/Hyperplane.hpp
+++ b/include/RAJA/pattern/kernel/Hyperplane.hpp
@@ -81,10 +81,8 @@ template <camp::idx_t HpArgumentId,
           typename ArgList,
           typename ExecPolicy,
           typename... EnclosedStmts>
-struct Hyperplane
-    : public internal::Statement<ExecPolicy,
-                                 EnclosedStmts...> {
-};
+struct Hyperplane : public internal::Statement<ExecPolicy, EnclosedStmts...>
+{};
 
 }  // end namespace statement
 
@@ -93,9 +91,8 @@ namespace internal
 
 
 template <camp::idx_t HpArgumentId, typename ArgList, typename... EnclosedStmts>
-struct HyperplaneInner
-    : public internal::Statement<camp::nil, EnclosedStmts...> {
-};
+struct HyperplaneInner : public internal::Statement<camp::nil, EnclosedStmts...>
+{};
 
 
 template <camp::idx_t HpArgumentId,
@@ -108,11 +105,13 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
                                                HpExecPolicy,
                                                ArgList<Args...>,
                                                ExecPolicy,
-                                               EnclosedStmts...>, Types> {
+                                               EnclosedStmts...>,
+                         Types>
+{
 
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  static RAJA_INLINE void exec(Data& data)
   {
 
     // get type of Hp arguments index
@@ -126,8 +125,7 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
     // Add a Collapse policy around our enclosed statements that will handle
     // the inner hyperplane loop's execution
     using kernel_policy = statement::Collapse<
-        ExecPolicy,
-        ArgList<Args...>,
+        ExecPolicy, ArgList<Args...>,
         HyperplaneInner<HpArgumentId, ArgList<Args...>, EnclosedStmts...>>;
 
     // Create a For-loop wrapper for the outer loop
@@ -135,9 +133,9 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
 
     // compute manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    idx_t hp_len = segment_length<HpArgumentId>(data) +
-                   foldl(RAJA::operators::plus<idx_t>(),
-                                 segment_length<Args>(data)...);
+    idx_t hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<idx_t>(), segment_length<Args>(data)...);
 
     /* Execute the outer loop over hyperplanes
      *
@@ -146,10 +144,8 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
      * arguments actual value (and restrict to valid hyperplane indices)
      */
     auto r = resources::get_resource<HpExecPolicy>::type::get_default();
-    forall_impl(r, HpExecPolicy{},
-                TypedRangeSegment<idx_t>(0, hp_len),
-                outer_wrapper,
-                RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, HpExecPolicy {}, TypedRangeSegment<idx_t>(0, hp_len),
+                outer_wrapper, RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
@@ -159,27 +155,30 @@ template <camp::idx_t HpArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    HyperplaneInner<HpArgumentId, ArgList<Args...>, EnclosedStmts...>, Types> {
+    HyperplaneInner<HpArgumentId, ArgList<Args...>, EnclosedStmts...>,
+    Types>
+{
 
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  static RAJA_INLINE void exec(Data& data)
   {
 
     // get h value
-    auto h = camp::get<HpArgumentId>(data.offset_tuple);
+    auto h      = camp::get<HpArgumentId>(data.offset_tuple);
     using idx_t = decltype(h);
 
     // compute actual iterate for HpArgumentId
     // as:  i0 = h - (i1 + i2 + i3 + ...)
     idx_t i = h - foldl(RAJA::operators::plus<idx_t>(),
-                                camp::get<Args>(data.offset_tuple)...);
+                        camp::get<Args>(data.offset_tuple)...);
 
     // get length of Hp indexed argument
     auto len = segment_length<HpArgumentId>(data);
 
     // check bounds
-    if (i >= 0 && i < len) {
+    if (i >= 0 && i < len)
+    {
 
       // store in tuple
       data.template assign_offset<HpArgumentId>(i);
diff --git a/include/RAJA/pattern/kernel/InitLocalMem.hpp b/include/RAJA/pattern/kernel/InitLocalMem.hpp
index 21d9e3cd2a..25bd0a10df 100644
--- a/include/RAJA/pattern/kernel/InitLocalMem.hpp
+++ b/include/RAJA/pattern/kernel/InitLocalMem.hpp
@@ -26,7 +26,7 @@
 namespace RAJA
 {
 
-//Policies for RAJA local arrays
+// Policies for RAJA local arrays
 struct cpu_tile_mem;
 
 
@@ -43,14 +43,16 @@ namespace statement
  * IntiLocalMem<Pol, RAJA::param_idx<0>, statements...>
  * Will intialize the 0th array in the param tuple
  */
-template<typename Pol, typename Indices, typename... EnclosedStmts>
-struct InitLocalMem : public internal::Statement<camp::nil> {
-};
+template <typename Pol, typename Indices, typename... EnclosedStmts>
+struct InitLocalMem : public internal::Statement<camp::nil>
+{};
 
-//Policy Specialization
-template<camp::idx_t... Indices, typename... EnclosedStmts>
-struct InitLocalMem<RAJA::cpu_tile_mem, camp::idx_seq<Indices...>, EnclosedStmts...> : public internal::Statement<camp::nil> {
-};
+// Policy Specialization
+template <camp::idx_t... Indices, typename... EnclosedStmts>
+struct InitLocalMem<RAJA::cpu_tile_mem,
+                    camp::idx_seq<Indices...>,
+                    EnclosedStmts...> : public internal::Statement<camp::nil>
+{};
 
 
 }  // end namespace statement
@@ -58,28 +60,33 @@ struct InitLocalMem<RAJA::cpu_tile_mem, camp::idx_seq<Indices...>, EnclosedStmts
 namespace internal
 {
 
-//Statement executor to initalize RAJA local array
-template<camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,camp::idx_seq<Indices...>, EnclosedStmts...>, Types>{
-  
-  //Execute statement list
-  template<class Data>
-  static void RAJA_INLINE exec_expanded(Data && data)
+// Statement executor to initialize RAJA local array
+template <camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
+struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,
+                                                 camp::idx_seq<Indices...>,
+                                                 EnclosedStmts...>,
+                         Types>
+{
+
+  // Execute statement list
+  template <class Data>
+  static void RAJA_INLINE exec_expanded(Data&& data)
   {
     execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
   }
-  
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t... others, class Data>
-  static void RAJA_INLINE exec_expanded(Data && data)
+
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t... others, class Data>
+  static void RAJA_INLINE exec_expanded(Data&& data)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
 
     // Initialize memory
 #ifdef RAJA_COMPILER_MSVC
     // MSVC doesn't like taking a pointer to stack allocated data?!?!
-    varType *ptr = new varType[camp::get<Pos>(data.param_tuple).size()];
+    varType* ptr = new varType[camp::get<Pos>(data.param_tuple).size()];
     camp::get<Pos>(data.param_tuple).set_data(ptr);
 #else
     varType Array[camp::get<Pos>(data.param_tuple).size()];
@@ -95,16 +102,14 @@ struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,camp::idx_se
     delete[] ptr;
 #endif
   }
-  
 
-  
-  template<typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+
+  template <typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
-    //Initalize local arrays + execute statements + cleanup
+    // Initialize local arrays + execute statements + cleanup
     exec_expanded<Indices...>(data);
   }
-  
 };
 
 
diff --git a/include/RAJA/pattern/kernel/Lambda.hpp b/include/RAJA/pattern/kernel/Lambda.hpp
index 29d41b431e..d9b87bf3d1 100644
--- a/include/RAJA/pattern/kernel/Lambda.hpp
+++ b/include/RAJA/pattern/kernel/Lambda.hpp
@@ -46,28 +46,28 @@ struct lambda_arg_param_t
 struct lambda_arg_offset_t
 {};
 
-template<typename T>
+template <typename T>
 struct lambda_arg_value_t
 {
-    using type = T;
+  using type = T;
 };
 
-template<typename T, camp::idx_t V>
+template <typename T, camp::idx_t V>
 struct LambdaArg
 {
-    static constexpr camp::idx_t value = V;
+  static constexpr camp::idx_t value = V;
 };
 
-}
-
+}  // namespace internal
 
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more segment values
  * should be passed into the lambda as an argument
  */
-template<camp::idx_t ... args>
-using Segs = camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...>;
+template <camp::idx_t... args>
+using Segs =
+    camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...>;
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more segment offsets
@@ -79,16 +79,18 @@ using Segs = camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...
  * In the case of tiling (with Tile) the offset is w.r.t. the beginning of the
  * current tile.
  */
-template<camp::idx_t ... args>
-using Offsets = camp::list<internal::LambdaArg<internal::lambda_arg_offset_t, args>...>;
+template <camp::idx_t... args>
+using Offsets =
+    camp::list<internal::LambdaArg<internal::lambda_arg_offset_t, args>...>;
 
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more parameters that
  * should be passed into the lambda as an argument.
  */
-template<camp::idx_t ... args>
-using Params = camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args>...>;
+template <camp::idx_t... args>
+using Params =
+    camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args>...>;
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more constant values
@@ -103,8 +105,9 @@ using Params = camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args
  * writing:   Lambda<0, ValuesT<double, 3, 4>>
  * invokes:   lambda0( (double)3, (double) 4 )
  */
-template<typename T, camp::idx_t ... values>
-using ValuesT = camp::list<internal::LambdaArg<internal::lambda_arg_value_t<T>, values>...>;
+template <typename T, camp::idx_t... values>
+using ValuesT =
+    camp::list<internal::LambdaArg<internal::lambda_arg_value_t<T>, values>...>;
 
 
 namespace statement
@@ -119,8 +122,9 @@ namespace statement
  * RAJA::kernel<exec_pol>(make_tuple{s0, s1, s2}, lambda0, lambda1);
  *
  */
-template <camp::idx_t BodyIdx, typename... Args >
-struct Lambda : internal::Statement<camp::nil> {
+template <camp::idx_t BodyIdx, typename... Args>
+struct Lambda : internal::Statement<camp::nil>
+{
   static const camp::idx_t loop_body_index = BodyIdx;
 };
 
@@ -130,13 +134,6 @@ namespace internal
 {
 
 
-
-
-
-
-
-
-
 /*
  * Helper that extracts a segment value for a lambda argument
  *
@@ -146,26 +143,23 @@ namespace internal
  * This class allows specialization on the segment type in LoopTypes so that
  * fancier constructions can happen (ie vector_exec, etc.)
  */
-template<typename SegmentType, camp::idx_t id>
+template <typename SegmentType, camp::idx_t id>
 struct LambdaSegExtractor
 {
 
-  static_assert(!std::is_same<SegmentType, void>::value,
+  static_assert(
+      !std::is_same<SegmentType, void>::value,
       "Segment not assigned, but used in Lambda with Segs<> argument");
 
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static SegmentType extract(Data &&data)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data&& data)
   {
-    return SegmentType(camp::get<id>(data.segment_tuple).begin()[camp::get<id>(data.offset_tuple)]);
+    return SegmentType(camp::get<id>(data.segment_tuple)
+                           .begin()[camp::get<id>(data.offset_tuple)]);
   }
-
 };
 
 
-
 /*
  * Helper that extracts a segment value for a lambda argument
  *
@@ -175,26 +169,22 @@ struct LambdaSegExtractor
  * This class allows specialization on the segment type in LoopTypes so that
  * fancier constructions can happen (ie vector_exec, etc.)
  */
-template<typename OffsetType, camp::idx_t id>
+template <typename OffsetType, camp::idx_t id>
 struct LambdaOffsetExtractor
 {
 
-  static_assert(!std::is_same<OffsetType, void>::value,
+  static_assert(
+      !std::is_same<OffsetType, void>::value,
       "Segment not assigned, but used in Lambda with Offsets<> argument");
 
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static OffsetType extract(Data &&data)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data&& data)
   {
     return OffsetType(camp::get<id>(data.offset_tuple));
   }
-
 };
 
 
-
 /*
  * Helper that provides first level of argument extraction
  * This acts as a switchboard between Segs, Offsets, and Params
@@ -202,140 +192,140 @@ struct LambdaOffsetExtractor
  * It calls LambdaArgExtractor to perform the actual argument extraction.
  * This allows LambdaArgExtractor to be specialized
  */
-template<typename Types, typename T>
+template <typename Types, typename T>
 struct LambdaArgSwitchboard;
 
 
-template<typename Types, camp::idx_t id>
+template <typename Types, camp::idx_t id>
 struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_offset_t, id>>
 {
 
   using OffsetType = camp::at_v<typename Types::offset_types_t, id>;
 
-  static_assert(!std::is_same<OffsetType, void>::value,
+  static_assert(
+      !std::is_same<OffsetType, void>::value,
       "Offset not assigned, but used in Lambda with Offsets<> argument");
 
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static OffsetType extract(Data &&data)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data&& data)
   {
-    return LambdaOffsetExtractor<OffsetType, id>::extract(std::forward<Data>(data));
+    return LambdaOffsetExtractor<OffsetType, id>::extract(
+        std::forward<Data>(data));
   }
-
 };
 
-template<typename Types, camp::idx_t id>
+template <typename Types, camp::idx_t id>
 struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_seg_t, id>>
 {
 
   using SegmentType = camp::at_v<typename Types::segment_types_t, id>;
 
-  static_assert(!std::is_same<SegmentType, void>::value,
+  static_assert(
+      !std::is_same<SegmentType, void>::value,
       "Segment not assigned, but used in Lambda with Segs<> argument");
 
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static SegmentType extract(Data &&data)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data&& data)
   {
-    return LambdaSegExtractor<SegmentType, id>::extract(std::forward<Data>(data));
+    return LambdaSegExtractor<SegmentType, id>::extract(
+        std::forward<Data>(data));
   }
-
 };
 
-template<typename Types, camp::idx_t id>
+template <typename Types, camp::idx_t id>
 struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_param_t, id>>
 {
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static auto extract(Data &&data)->
-    typename std::add_lvalue_reference<camp::tuple_element_t<id,typename camp::decay<Data>::param_tuple_t>>::type
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static auto
+  extract(Data&& data) -> typename std::add_lvalue_reference<
+      camp::tuple_element_t<id,
+                            typename camp::decay<Data>::param_tuple_t>>::type
   {
     return camp::get<id>(data.param_tuple);
   }
 };
 
 
-template<typename Types, typename T, camp::idx_t value>
+template <typename Types, typename T, camp::idx_t value>
 struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_value_t<T>, value>>
 {
-  template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static T extract(Data &&)
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static T extract(Data&&)
   {
     return T(value);
   }
 };
 
 
-
 RAJA_SUPPRESS_HD_WARN
-template<camp::idx_t LoopIndex, typename Types, typename Data, typename... targLists>
-RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda_with_args(Data &&data,
-                                                       camp::list<targLists...> const &)
+template <camp::idx_t LoopIndex,
+          typename Types,
+          typename Data,
+          typename... targLists>
+RAJA_INLINE RAJA_HOST_DEVICE void
+invoke_lambda_with_args(Data&& data, camp::list<targLists...> const&)
 {
   camp::get<LoopIndex>(data.bodies)(
       LambdaArgSwitchboard<Types, targLists>::extract(data)...);
 }
 
 
-
-
 /*!
  * A RAJA::kernel statement that invokes a lambda function
  * with user specified arguments.
  */
-template <camp::idx_t LambdaIndex,typename... Args, typename Types>
-struct StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types> {
+template <camp::idx_t LambdaIndex, typename... Args, typename Types>
+struct StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data)
+  static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data&& data)
   {
 
-    //Convert SegList, ParamList into Seg, Param types, and store in a list
+    // Convert SegList, ParamList into Seg, Param types, and store in a list
     using targList = typename camp::flatten<camp::list<Args...>>::type;
 
-    invoke_lambda_with_args<LambdaIndex, Types>(std::forward<Data>(data), targList{});
+    invoke_lambda_with_args<LambdaIndex, Types>(std::forward<Data>(data),
+                                                targList {});
   }
 };
 
 
-
-template <camp::idx_t LambdaIndex, typename Types, typename Data, camp::idx_t ... SEGS, camp::idx_t ... PARAMS>
-RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data &&data, camp::idx_seq<SEGS...> const &, camp::idx_seq<PARAMS...> const &)
+template <camp::idx_t LambdaIndex,
+          typename Types,
+          typename Data,
+          camp::idx_t... SEGS,
+          camp::idx_t... PARAMS>
+RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data&& data,
+                                                camp::idx_seq<SEGS...> const&,
+                                                camp::idx_seq<PARAMS...> const&)
 {
 
-  using AllSegs = Segs<SEGS...>;
+  using AllSegs   = Segs<SEGS...>;
   using AllParams = Params<PARAMS...>;
 
   // invoke the expanded Lambda executor, passing in all segments and params
-  StatementExecutor<statement::Lambda<LambdaIndex, AllSegs, AllParams>, Types>::exec(std::forward<Data>(data));
+  StatementExecutor<statement::Lambda<LambdaIndex, AllSegs, AllParams>,
+                    Types>::exec(std::forward<Data>(data));
 }
 
 
 template <camp::idx_t LambdaIndex, typename Types>
-struct StatementExecutor<statement::Lambda<LambdaIndex>, Types> {
+struct StatementExecutor<statement::Lambda<LambdaIndex>, Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data)
+  static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data&& data)
   {
 
-    using Data_t = camp::decay<Data>;
+    using Data_t         = camp::decay<Data>;
     using offset_tuple_t = typename Data_t::offset_tuple_t;
-    using param_tuple_t = typename Data_t::param_tuple_t;
+    using param_tuple_t  = typename Data_t::param_tuple_t;
 
     invoke_lambda<LambdaIndex, Types>(
         std::forward<Data>(data),
-        camp::make_idx_seq_t<camp::tuple_size<offset_tuple_t>::value>{},
-        camp::make_idx_seq_t<camp::tuple_size<param_tuple_t>::value>{});
-
+        camp::make_idx_seq_t<camp::tuple_size<offset_tuple_t>::value> {},
+        camp::make_idx_seq_t<camp::tuple_size<param_tuple_t>::value> {});
   }
 };
 
diff --git a/include/RAJA/pattern/kernel/Param.hpp b/include/RAJA/pattern/kernel/Param.hpp
index 8e870ebe15..999e1a9ebe 100644
--- a/include/RAJA/pattern/kernel/Param.hpp
+++ b/include/RAJA/pattern/kernel/Param.hpp
@@ -31,10 +31,10 @@ namespace RAJA
 namespace internal
 {
 
-struct ParamBase {
-};
+struct ParamBase
+{};
 
-}// end namespace internal
+}  // end namespace internal
 
 namespace statement
 {
@@ -47,12 +47,13 @@ namespace statement
  * RAJA::kernel execution policies.
  */
 template <camp::idx_t ParamId>
-struct Param : public internal::ParamBase {
+struct Param : public internal::ParamBase
+{
 
   constexpr static camp::idx_t param_idx = ParamId;
 
   template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static auto eval(Data const &data)
+  RAJA_HOST_DEVICE RAJA_INLINE static auto eval(Data const& data)
       -> decltype(camp::get<ParamId>(data.param_tuple))
   {
     return camp::get<ParamId>(data.param_tuple);
diff --git a/include/RAJA/pattern/kernel/Reduce.hpp b/include/RAJA/pattern/kernel/Reduce.hpp
index 4de4922ea3..db45d2dfe4 100644
--- a/include/RAJA/pattern/kernel/Reduce.hpp
+++ b/include/RAJA/pattern/kernel/Reduce.hpp
@@ -39,10 +39,12 @@ namespace statement
  *
  */
 template <typename ReducePolicy,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts>
-struct Reduce : public internal::Statement<camp::nil, EnclosedStmts...> {
+struct Reduce : public internal::Statement<camp::nil, EnclosedStmts...>
+{
 
   static_assert(std::is_base_of<internal::ParamBase, ParamId>::value,
                 "Inappropriate ParamId, ParamId must be of type "
diff --git a/include/RAJA/pattern/kernel/Region.hpp b/include/RAJA/pattern/kernel/Region.hpp
index 82b79ae775..700df61199 100644
--- a/include/RAJA/pattern/kernel/Region.hpp
+++ b/include/RAJA/pattern/kernel/Region.hpp
@@ -30,9 +30,9 @@ namespace RAJA
 namespace statement
 {
 
-template<typename RegionPolicy, typename... EnclosedStmts>
-struct Region : public internal::Statement<camp::nil> {
-};
+template <typename RegionPolicy, typename... EnclosedStmts>
+struct Region : public internal::Statement<camp::nil>
+{};
 
 
 }  // end namespace statement
@@ -40,23 +40,27 @@ struct Region : public internal::Statement<camp::nil> {
 namespace internal
 {
 
-//Statement executor to create a region within kernel
-
-//Note: RAJA region's lambda must capture by reference otherwise
-//internal function calls are undefined.
-template<typename RegionPolicy, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::Region<RegionPolicy, EnclosedStmts...>, Types> {
+// Statement executor to create a region within kernel
 
-template<typename Data>
-static RAJA_INLINE void exec(Data &&data)
+// Note: RAJA region's lambda must capture by reference otherwise
+// internal function calls are undefined.
+template <typename RegionPolicy, typename... EnclosedStmts, typename Types>
+struct StatementExecutor<statement::Region<RegionPolicy, EnclosedStmts...>,
+                         Types>
 {
 
-  RAJA::region<RegionPolicy>([&]() {
-      using data_t = camp::decay<Data>;
-      execute_statement_list<camp::list<EnclosedStmts...>, Types>(data_t(data));
-    });
-}
-
+  template <typename Data>
+  static RAJA_INLINE void exec(Data&& data)
+  {
+
+    RAJA::region<RegionPolicy>(
+        [&]()
+        {
+          using data_t = camp::decay<Data>;
+          execute_statement_list<camp::list<EnclosedStmts...>, Types>(
+              data_t(data));
+        });
+  }
 };
 
 
diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp
index 43f72e0545..3b3b3e689d 100644
--- a/include/RAJA/pattern/kernel/Tile.hpp
+++ b/include/RAJA/pattern/kernel/Tile.hpp
@@ -34,14 +34,13 @@
 namespace RAJA
 {
 
-struct TileSize {
+struct TileSize
+{
   const camp::idx_t size;
 
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  constexpr TileSize(camp::idx_t size_) : size{size_}
-  {
-  }
+  constexpr TileSize(camp::idx_t size_) : size {size_} {}
 };
 
 namespace statement
@@ -56,7 +55,8 @@ template <camp::idx_t ArgumentId,
           typename TilePolicy,
           typename ExecPolicy,
           typename... EnclosedStmts>
-struct Tile : public internal::Statement<ExecPolicy, EnclosedStmts...> {
+struct Tile : public internal::Statement<ExecPolicy, EnclosedStmts...>
+{
   using tile_policy_t = TilePolicy;
   using exec_policy_t = ExecPolicy;
 };
@@ -65,17 +65,18 @@ struct Tile : public internal::Statement<ExecPolicy, EnclosedStmts...> {
 
 ///! tag for a tiling loop
 template <camp::idx_t chunk_size_>
-struct tile_fixed {
+struct tile_fixed
+{
   static constexpr camp::idx_t chunk_size = chunk_size_;
 };
 
 template <camp::idx_t ArgumentId>
-struct tile_dynamic {
+struct tile_dynamic
+{
   static constexpr camp::idx_t id = ArgumentId;
 };
 
 
-
 namespace internal
 {
 
@@ -84,8 +85,12 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
-struct TileWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+template <camp::idx_t ArgumentId,
+          typename Data,
+          typename Types,
+          typename... EnclosedStmts>
+struct TileWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
+{
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
@@ -104,7 +109,8 @@ struct TileWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
 
 
 template <typename Iterable>
-struct IterableTiler {
+struct IterableTiler
+{
   using value_type = camp::decay<Iterable>;
 
   struct iterate
@@ -120,46 +126,45 @@ struct IterableTiler {
     const Index_type block_id;
 
   public:
-    using value_type = iterate;
-    using difference_type = camp::idx_t;
-    using pointer = value_type *;
-    using reference = value_type &;
+    using value_type        = iterate;
+    using difference_type   = camp::idx_t;
+    using pointer           = value_type*;
+    using reference         = value_type&;
     using iterator_category = std::random_access_iterator_tag;
 
     RAJA_HOST_DEVICE
     RAJA_INLINE
-    constexpr iterator(IterableTiler const &itiler_, Index_type block_id_)
-        : itiler{itiler_}, block_id{block_id_}
-    {
-    }
+    constexpr iterator(IterableTiler const& itiler_, Index_type block_id_)
+        : itiler {itiler_}, block_id {block_id_}
+    {}
 
     RAJA_HOST_DEVICE
     RAJA_INLINE
     value_type operator*()
     {
       auto start = block_id * itiler.block_size;
-      return iterate{itiler.it.slice(start, itiler.block_size), block_id};
+      return iterate {itiler.it.slice(start, itiler.block_size), block_id};
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE difference_type operator-(const iterator &rhs) const
+    RAJA_INLINE difference_type operator-(const iterator& rhs) const
     {
       return static_cast<difference_type>(block_id) -
              static_cast<difference_type>(rhs.block_id);
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE iterator operator-(const difference_type &rhs) const
+    RAJA_INLINE iterator operator-(const difference_type& rhs) const
     {
       return iterator(itiler, block_id - rhs);
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE iterator operator+(const difference_type &rhs) const
+    RAJA_INLINE iterator operator+(const difference_type& rhs) const
     {
-      return iterator(itiler,
-                      block_id + rhs >= itiler.num_blocks ? itiler.num_blocks
-                                                          : block_id + rhs);
+      return iterator(itiler, block_id + rhs >= itiler.num_blocks
+                                  ? itiler.num_blocks
+                                  : block_id + rhs);
     }
 
     RAJA_HOST_DEVICE
@@ -169,13 +174,13 @@ struct IterableTiler {
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE bool operator!=(const iterator &rhs) const
+    RAJA_INLINE bool operator!=(const iterator& rhs) const
     {
       return block_id != rhs.block_id;
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE bool operator<(const iterator &rhs) const
+    RAJA_INLINE bool operator<(const iterator& rhs) const
     {
       return block_id < rhs.block_id;
     }
@@ -183,16 +188,17 @@ struct IterableTiler {
 
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  IterableTiler(const Iterable &it_, camp::idx_t block_size_)
-      : it{it_}, block_size{block_size_}
+  IterableTiler(const Iterable& it_, camp::idx_t block_size_)
+      : it {it_}, block_size {block_size_}
   {
     using std::begin;
     using std::distance;
     using std::end;
-    dist = it.end() - it.begin();  // distance(begin(it), end(it));
+    dist       = it.end() - it.begin();  // distance(begin(it), end(it));
     num_blocks = dist / block_size;
     // if (dist % block_size) num_blocks += 1;
-    if (dist - num_blocks * block_size > 0) {
+    if (dist - num_blocks * block_size > 0)
+    {
       num_blocks += 1;
     }
   }
@@ -222,13 +228,15 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::Tile<ArgumentId, tile_fixed<ChunkSize>, EPol, EnclosedStmts...>, Types> {
+    statement::Tile<ArgumentId, tile_fixed<ChunkSize>, EPol, EnclosedStmts...>,
+    Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  static RAJA_INLINE void exec(Data& data)
   {
     // Get the segment we are going to tile
-    auto const &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto const& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Get the tiling policies chunk size
     auto chunk_size = tile_fixed<ChunkSize>::chunk_size;
@@ -238,47 +246,51 @@ struct StatementExecutor<
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileWrapper<ArgumentId, Data, Types,
-                EnclosedStmts...> tile_wrapper(data);
+    TileWrapper<ArgumentId, Data, Types, EnclosedStmts...> tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, EPol {}, tiled_iterable, tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
 
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
   }
 };
 
-template<camp::idx_t ArgumentId,
-  typename EPol,
-  typename... EnclosedStmts,
-  typename Types>
+template <camp::idx_t ArgumentId,
+          typename EPol,
+          typename... EnclosedStmts,
+          typename Types>
 struct StatementExecutor<
-    statement::Tile<ArgumentId, tile_dynamic<ArgumentId>, EPol, EnclosedStmts...>, Types> {
+    statement::
+        Tile<ArgumentId, tile_dynamic<ArgumentId>, EPol, EnclosedStmts...>,
+    Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  static RAJA_INLINE void exec(Data& data)
   {
     // Get the segment we are going to tile
-    auto const &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto const& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Get the tiling policies chunk size
     auto chunk_size = camp::get<ArgumentId>(data.param_tuple);
-    static_assert(camp::concepts::metalib::is_same<TileSize, decltype(chunk_size)>::value,
-                  "Extracted parameter must be of type TileSize.");
+    static_assert(
+        camp::concepts::metalib::is_same<TileSize, decltype(chunk_size)>::value,
+        "Extracted parameter must be of type TileSize.");
 
     // Create a tile iterator
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size.size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileWrapper<ArgumentId, Data, Types,
-                EnclosedStmts...> tile_wrapper(data);
+    TileWrapper<ArgumentId, Data, Types, EnclosedStmts...> tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
-    
+    forall_impl(r, EPol {}, tiled_iterable, tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
+
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
   }
diff --git a/include/RAJA/pattern/kernel/TileTCount.hpp b/include/RAJA/pattern/kernel/TileTCount.hpp
index 2653e992c7..d741e0a4b0 100644
--- a/include/RAJA/pattern/kernel/TileTCount.hpp
+++ b/include/RAJA/pattern/kernel/TileTCount.hpp
@@ -47,7 +47,8 @@ template <camp::idx_t ArgumentId,
           typename TilePolicy,
           typename ExecPolicy,
           typename... EnclosedStmts>
-struct TileTCount : public internal::Statement<ExecPolicy, EnclosedStmts...> {
+struct TileTCount : public internal::Statement<ExecPolicy, EnclosedStmts...>
+{
   static_assert(std::is_base_of<internal::ParamBase, ParamId>::value,
                 "Inappropriate ParamId, ParamId must be of type "
                 "RAJA::Statement::Param< # >");
@@ -66,9 +67,13 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId, typename Data, typename Types,
+template <camp::idx_t ArgumentId,
+          typename ParamId,
+          typename Data,
+          typename Types,
           typename... EnclosedStmts>
-struct TileTCountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+struct TileTCountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
+{
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
@@ -79,17 +84,16 @@ struct TileTCountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
   {
     // Assign the tile's segment to the tuple
     camp::get<ArgumentId>(Base::data.segment_tuple) = si.s;
-    
+
     // Assign the tile's index
     Base::data.template assign_param<ParamId>(si.i);
-    
+
     // Execute enclosed statements
     Base::exec();
   }
 };
 
 
-
 /*!
  * A generic RAJA::kernel forall_impl executor for statement::TileTCount
  *
@@ -102,14 +106,16 @@ template <camp::idx_t ArgumentId,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::TileTCount<ArgumentId, ParamId, TPol, EPol, EnclosedStmts...>, Types> {
+    statement::TileTCount<ArgumentId, ParamId, TPol, EPol, EnclosedStmts...>,
+    Types>
+{
 
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  static RAJA_INLINE void exec(Data& data)
   {
     // Get the segment we are going to tile
-    auto const &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto const& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Get the tiling policies chunk size
     auto chunk_size = TPol::chunk_size;
@@ -119,12 +125,13 @@ struct StatementExecutor<
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileTCountWrapper<ArgumentId, ParamId, Data, Types,
-                      EnclosedStmts...> tile_wrapper(data);
+    TileTCountWrapper<ArgumentId, ParamId, Data, Types, EnclosedStmts...>
+        tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, EPol {}, tiled_iterable, tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
 
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
diff --git a/include/RAJA/pattern/kernel/internal/LoopData.hpp b/include/RAJA/pattern/kernel/internal/LoopData.hpp
index 9667a55538..08f72ab91f 100644
--- a/include/RAJA/pattern/kernel/internal/LoopData.hpp
+++ b/include/RAJA/pattern/kernel/internal/LoopData.hpp
@@ -40,29 +40,27 @@ namespace internal
 {
 
 
-
-
-  // Universal base of all For wrappers for type traits
-  struct ForList {
-  };
-  struct ForBase {
-  };
-  struct CollapseBase {
-  };
-  template <camp::idx_t ArgumentId, typename Policy>
-  struct ForTraitBase : public ForBase {
-    constexpr static camp::idx_t index_val = ArgumentId;
-    using index = camp::num<ArgumentId>;
-    using index_type = camp::nil;  // default to invalid type
-    using policy_type = Policy;
-    using type = ForTraitBase;  // make camp::value compatible
-  };
-
-
+// Universal base of all For wrappers for type traits
+struct ForList
+{};
+struct ForBase
+{};
+struct CollapseBase
+{};
+template <camp::idx_t ArgumentId, typename Policy>
+struct ForTraitBase : public ForBase
+{
+  constexpr static camp::idx_t index_val = ArgumentId;
+  using index                            = camp::num<ArgumentId>;
+  using index_type  = camp::nil;  // default to invalid type
+  using policy_type = Policy;
+  using type        = ForTraitBase;  // make camp::value compatible
+};
 
 
 template <typename Iterator>
-struct iterable_difftype_getter {
+struct iterable_difftype_getter
+{
   using type = typename std::iterator_traits<
       typename Iterator::iterator>::difference_type;
 };
@@ -79,7 +77,8 @@ using difftype_tuple_from_segments =
 
 
 template <typename Iterator>
-struct iterable_value_type_getter {
+struct iterable_value_type_getter
+{
   using type =
       typename std::iterator_traits<typename Iterator::iterator>::value_type;
 };
@@ -100,13 +99,12 @@ using index_types_from_segments =
                            value_type_list_from_segments<Segments>>::type;
 
 
-
-
 template <typename SegmentTuple,
           typename ParamTuple,
           typename Resource,
           typename... Bodies>
-struct LoopData {
+struct LoopData
+{
 
   using Self = LoopData<SegmentTuple, ParamTuple, Resource, Bodies...>;
 
@@ -138,78 +136,70 @@ struct LoopData {
   using vector_sizes_t = tuple_of_n<int, camp::tuple_size<SegmentTuple>::value>;
   vector_sizes_t vector_sizes;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  LoopData(SegmentTuple const &s, ParamTuple const &p, Resource r, Bodies const &... b)
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr LoopData(SegmentTuple const& s,
+                                                  ParamTuple const& p,
+                                                  Resource r,
+                                                  Bodies const&... b)
       : segment_tuple(s), param_tuple(p), res(r), bodies(b...)
-  {
-  }
-  constexpr LoopData(LoopData const &) = default;
-  constexpr LoopData(LoopData &&) = default;
+  {}
+  constexpr LoopData(LoopData const&) = default;
+  constexpr LoopData(LoopData&&)      = default;
 
   template <camp::idx_t Idx, typename IndexT>
-  RAJA_HOST_DEVICE RAJA_INLINE void assign_offset(IndexT const &i)
+  RAJA_HOST_DEVICE RAJA_INLINE void assign_offset(IndexT const& i)
   {
     camp::get<Idx>(offset_tuple) = i;
   }
 
   template <typename ParamId, typename IndexT>
-  RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const &i)
+  RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const& i)
   {
-    using param_t = camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>;
+    using param_t =
+        camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>;
     camp::get<ParamId::param_idx>(param_tuple) = param_t(i);
   }
 
   template <typename ParamId>
-  RAJA_HOST_DEVICE RAJA_INLINE
-  auto get_param() ->
-    camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>
+  RAJA_HOST_DEVICE RAJA_INLINE auto get_param()
+      -> camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>
   {
     return camp::get<ParamId::param_idx>(param_tuple);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  Resource get_resource()
-  {
-    return res;
-  }
-
-
+  RAJA_HOST_DEVICE RAJA_INLINE Resource get_resource() { return res; }
 };
 
 
-
-
 template <camp::idx_t ArgumentId, typename Data>
-using segment_diff_type =
-    typename std::iterator_traits<
-        typename camp::at_v<typename Data::segment_tuple_t::TList,
-                            ArgumentId>::iterator>::difference_type;
-
-
+using segment_diff_type = typename std::iterator_traits<
+    typename camp::at_v<typename Data::segment_tuple_t::TList,
+                        ArgumentId>::iterator>::difference_type;
 
 
 template <camp::idx_t ArgumentId, typename Data>
-RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const &data) ->
-  segment_diff_type<ArgumentId, Data>
+RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const& data)
+    -> segment_diff_type<ArgumentId, Data>
 {
   return camp::get<ArgumentId>(data.segment_tuple).end() -
          camp::get<ArgumentId>(data.segment_tuple).begin();
 }
 
 
-
-
 template <typename Data, typename Types, typename... EnclosedStmts>
-struct GenericWrapper : GenericWrapperBase {
+struct GenericWrapper : GenericWrapperBase
+{
   using data_t = camp::decay<Data>;
 
-  data_t &data;
+  data_t& data;
 
   RAJA_INLINE
-  constexpr explicit GenericWrapper(data_t &d) : data{d} {}
+  constexpr explicit GenericWrapper(data_t& d) : data {d} {}
 
   RAJA_INLINE
-  void exec() { execute_statement_list<camp::list<EnclosedStmts...>, Types>(data); }
+  void exec()
+  {
+    execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
+  }
 };
 
 
@@ -217,26 +207,25 @@ struct GenericWrapper : GenericWrapperBase {
  * Convenience object used to create a thread-private LoopData object.
  */
 template <typename T>
-struct NestedPrivatizer {
-  using data_t = typename T::data_t;
-  using value_type = camp::decay<T>;
-  using reference_type = value_type &;
+struct NestedPrivatizer
+{
+  using data_t         = typename T::data_t;
+  using value_type     = camp::decay<T>;
+  using reference_type = value_type&;
 
   data_t privatized_data;
   value_type privatized_wrapper;
 
   RAJA_INLINE
-  constexpr NestedPrivatizer(const T &o)
-      : privatized_data{o.data}, privatized_wrapper(privatized_data)
-  {
-  }
+  constexpr NestedPrivatizer(const T& o)
+      : privatized_data {o.data}, privatized_wrapper(privatized_data)
+  {}
 
   RAJA_INLINE
   reference_type get_priv() { return privatized_wrapper; }
 };
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
index 7f77df4214..0f334c542b 100644
--- a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
+++ b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
@@ -29,63 +29,71 @@ namespace internal
 {
 
 
-template <typename SegmentTypes,
-          typename OffsetTypes>
+template <typename SegmentTypes, typename OffsetTypes>
 struct LoopTypes;
 
-template <typename ... SegmentTypes,
-          typename ... OffsetTypes>
-struct LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>> {
+template <typename... SegmentTypes, typename... OffsetTypes>
+struct LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>>
+{
 
-  using Self = LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>>;
+  using Self =
+      LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>>;
 
   static constexpr size_t s_num_segments = sizeof...(SegmentTypes);
 
   // This ensures that you don't double-loop over a segment within the same
   // loop nesting
   static_assert(s_num_segments == sizeof...(OffsetTypes),
-      "Number of segments and offsets must match");
+                "Number of segments and offsets must match");
 
   using segment_types_t = camp::list<SegmentTypes...>;
-  using offset_types_t = camp::list<OffsetTypes...>;
+  using offset_types_t  = camp::list<OffsetTypes...>;
 };
 
 
-template<typename Data>
-using makeInitialLoopTypes =
-    LoopTypes<list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>,
-              list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>>;
+template <typename Data>
+using makeInitialLoopTypes = LoopTypes<
+    list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>,
+    list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>>;
 
 
-template<typename Types, camp::idx_t Segment, typename T, typename Seq>
+template <typename Types, camp::idx_t Segment, typename T, typename Seq>
 struct SetSegmentTypeHelper;
 
-template<typename Types,
-         camp::idx_t Segment,
-         typename T,
-         camp::idx_t ... SEQ>
+template <typename Types, camp::idx_t Segment, typename T, camp::idx_t... SEQ>
 struct SetSegmentTypeHelper<Types, Segment, T, camp::idx_seq<SEQ...>>
 {
-    using segment_list = typename Types::segment_types_t;
-    using offset_list = typename Types::offset_types_t;
-
-    static_assert(std::is_same<camp::at_v<segment_list, Segment>, void>::value,
-        "Segment was already assigned: Probably looping over same segment in loop nest");
-
-    using type = LoopTypes<
-        camp::list<typename std::conditional<SEQ == Segment, T, camp::at_v<segment_list, SEQ>>::type...>,
-        camp::list<typename std::conditional<SEQ == Segment, T, camp::at_v<segment_list, SEQ>>::type...>>;
-
+  using segment_list = typename Types::segment_types_t;
+  using offset_list  = typename Types::offset_types_t;
+
+  static_assert(std::is_same<camp::at_v<segment_list, Segment>, void>::value,
+                "Segment was already assigned: Probably looping over same "
+                "segment in loop nest");
+
+  using type = LoopTypes<
+      camp::list<
+          typename std::conditional<SEQ == Segment,
+                                    T,
+                                    camp::at_v<segment_list, SEQ>>::type...>,
+      camp::list<
+          typename std::conditional<SEQ == Segment,
+                                    T,
+                                    camp::at_v<segment_list, SEQ>>::type...>>;
 };
 
 
-template<typename Types, camp::idx_t Segment, typename T>
-using setSegmentType =
-    typename SetSegmentTypeHelper<Types, Segment, T, camp::make_idx_seq_t<Types::s_num_segments>>::type;
+template <typename Types, camp::idx_t Segment, typename T>
+using setSegmentType = typename SetSegmentTypeHelper<
+    Types,
+    Segment,
+    T,
+    camp::make_idx_seq_t<Types::s_num_segments>>::type;
 
-template<typename Types, camp::idx_t Segment, typename Data>
-using setSegmentTypeFromData =
-    setSegmentType<Types, Segment, camp::at_v<typename camp::decay<Data>::index_types_t, Segment>>;
+template <typename Types, camp::idx_t Segment, typename Data>
+using setSegmentTypeFromData = setSegmentType<
+    Types,
+    Segment,
+    camp::at_v<typename camp::decay<Data>::index_types_t, Segment>>;
 
 
 }  // end namespace internal
diff --git a/include/RAJA/pattern/kernel/internal/Statement.hpp b/include/RAJA/pattern/kernel/internal/Statement.hpp
index 48ca828a68..c0402edad9 100644
--- a/include/RAJA/pattern/kernel/internal/Statement.hpp
+++ b/include/RAJA/pattern/kernel/internal/Statement.hpp
@@ -28,25 +28,24 @@ namespace internal
 {
 
 
-
 template <typename ExecPolicy, typename... EnclosedStmts>
-struct Statement {
-  static_assert(std::is_same<ExecPolicy, camp::nil>::value || sizeof...(EnclosedStmts) > 0,
-      "Executable statement with no enclosed statements, this is almost certainly a bug");
+struct Statement
+{
+  static_assert(std::is_same<ExecPolicy, camp::nil>::value ||
+                    sizeof...(EnclosedStmts) > 0,
+                "Executable statement with no enclosed statements, this is "
+                "almost certainly a bug");
   Statement() = delete;
 
   using enclosed_statements_t = StatementList<EnclosedStmts...>;
-  using execution_policy_t = ExecPolicy;
+  using execution_policy_t    = ExecPolicy;
 };
 
 
-
-
 template <typename Policy, typename Types>
 struct StatementExecutor;
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/StatementList.hpp b/include/RAJA/pattern/kernel/internal/StatementList.hpp
index 5c0d71afb4..f0e5cd5175 100644
--- a/include/RAJA/pattern/kernel/internal/StatementList.hpp
+++ b/include/RAJA/pattern/kernel/internal/StatementList.hpp
@@ -35,8 +35,6 @@ template <typename Policy, typename Types>
 struct StatementExecutor;
 
 
-
-
 template <typename... Stmts>
 using StatementList = camp::list<Stmts...>;
 
@@ -47,11 +45,13 @@ struct StatementListExecutor;
 
 template <camp::idx_t statement_index,
           camp::idx_t num_statements,
-          typename StmtList, typename Types>
-struct StatementListExecutor {
+          typename StmtList,
+          typename Types>
+struct StatementListExecutor
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Get the statement we're going to execute
@@ -61,8 +61,8 @@ struct StatementListExecutor {
     StatementExecutor<statement, Types>::exec(std::forward<Data>(data));
 
     // call our next statement
-    StatementListExecutor<statement_index + 1, num_statements, StmtList, Types>::exec(
-        std::forward<Data>(data));
+    StatementListExecutor<statement_index + 1, num_statements, StmtList,
+                          Types>::exec(std::forward<Data>(data));
   }
 };
 
@@ -72,24 +72,23 @@ struct StatementListExecutor {
  */
 
 template <camp::idx_t num_statements, typename StmtList, typename Types>
-struct StatementListExecutor<num_statements, num_statements, StmtList, Types> {
+struct StatementListExecutor<num_statements, num_statements, StmtList, Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&)
-  {
-  }
+  static RAJA_INLINE void exec(Data&&)
+  {}
 };
 
 
 template <typename StmtList, typename Types, typename Data>
-RAJA_INLINE void execute_statement_list(Data &&data)
+RAJA_INLINE void execute_statement_list(Data&& data)
 {
   StatementListExecutor<0, camp::size<StmtList>::value, StmtList, Types>::exec(
       std::forward<Data>(data));
 }
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/Template.hpp b/include/RAJA/pattern/kernel/internal/Template.hpp
index c750b95986..7771ae99ee 100644
--- a/include/RAJA/pattern/kernel/internal/Template.hpp
+++ b/include/RAJA/pattern/kernel/internal/Template.hpp
@@ -39,8 +39,8 @@ struct SeqToType
 template <typename T, typename SEQ>
 struct ListOfNHelper;
 
-template <typename T, camp::idx_t ... SEQ>
-struct ListOfNHelper<T, camp::idx_seq<SEQ...> >
+template <typename T, camp::idx_t... SEQ>
+struct ListOfNHelper<T, camp::idx_seq<SEQ...>>
 {
   using type = camp::list<typename SeqToType<T, SEQ>::type...>;
 };
@@ -49,13 +49,13 @@ struct ListOfNHelper<T, camp::idx_seq<SEQ...> >
 template <typename T, typename SEQ>
 struct TupleOfNHelper;
 
-template <typename T, camp::idx_t ... SEQ>
-struct TupleOfNHelper<T, camp::idx_seq<SEQ...> >
+template <typename T, camp::idx_t... SEQ>
+struct TupleOfNHelper<T, camp::idx_seq<SEQ...>>
 {
   using type = camp::tuple<typename SeqToType<T, SEQ>::type...>;
 };
 
-} // namespace detail
+}  // namespace detail
 
 /*
  *  This creates a camp::list with N types, each one being T.
@@ -64,7 +64,8 @@ struct TupleOfNHelper<T, camp::idx_seq<SEQ...> >
  *
  */
 template <typename T, camp::idx_t N>
-using list_of_n = typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::type;
+using list_of_n =
+    typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::type;
 
 
 /*
@@ -74,8 +75,8 @@ using list_of_n = typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::ty
  *
  */
 template <typename T, camp::idx_t N>
-using tuple_of_n = typename detail::TupleOfNHelper<T, camp::make_idx_seq_t<N>>::type;
-
+using tuple_of_n =
+    typename detail::TupleOfNHelper<T, camp::make_idx_seq_t<N>>::type;
 
 
 }  // end namespace internal
diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index f1d70aeacb..453dc75a1c 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -28,8 +28,8 @@
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-//Odd dependecy with atomics is breaking CI builds
-//#include "RAJA/util/View.hpp"
+// Odd dependency with atomics is breaking CI builds
+// #include "RAJA/util/View.hpp"
 
 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_ENABLE_SYCL)
 #define RAJA_TEAM_SHARED __shared__
@@ -41,12 +41,17 @@ namespace RAJA
 {
 
 // GPU or CPU threads available
-//strongly type the ExecPlace (guards agaist errors)
-enum struct ExecPlace : int { HOST, DEVICE, NUM_PLACES };
-
-struct null_launch_t {
+// strongly type the ExecPlace (guards against errors)
+enum struct ExecPlace : int
+{
+  HOST,
+  DEVICE,
+  NUM_PLACES
 };
 
+struct null_launch_t
+{};
+
 // Support for host, and device
 template <typename HOST_POLICY
 #if defined(RAJA_GPU_ACTIVE)
@@ -55,7 +60,8 @@ template <typename HOST_POLICY
 #endif
           >
 
-struct LoopPolicy {
+struct LoopPolicy
+{
   using host_policy_t = HOST_POLICY;
 #if defined(RAJA_GPU_ACTIVE)
   using device_policy_t = DEVICE_POLICY;
@@ -68,7 +74,8 @@ template <typename HOST_POLICY
           typename DEVICE_POLICY = HOST_POLICY
 #endif
           >
-struct LaunchPolicy {
+struct LaunchPolicy
+{
   using host_policy_t = HOST_POLICY;
 #if defined(RAJA_GPU_ACTIVE)
   using device_policy_t = DEVICE_POLICY;
@@ -76,48 +83,51 @@ struct LaunchPolicy {
 };
 
 
-struct Teams {
+struct Teams
+{
   int value[3];
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr Teams() : value{1, 1, 1} {}
+  constexpr Teams() : value {1, 1, 1} {}
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr Teams(int i) : value{i, 1, 1} {}
+  constexpr Teams(int i) : value {i, 1, 1} {}
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr Teams(int i, int j) : value{i, j, 1} {}
+  constexpr Teams(int i, int j) : value {i, j, 1} {}
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr Teams(int i, int j, int k) : value{i, j, k} {}
+  constexpr Teams(int i, int j, int k) : value {i, j, k} {}
 };
 
-struct Threads {
+struct Threads
+{
   int value[3];
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr Threads() : value{1, 1, 1} {}
+  constexpr Threads() : value {1, 1, 1} {}
 
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr Threads(int i) : value{i, 1, 1} {}
+  constexpr Threads(int i) : value {i, 1, 1} {}
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr Threads(int i, int j) : value{i, j, 1} {}
+  constexpr Threads(int i, int j) : value {i, j, 1} {}
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  constexpr Threads(int i, int j, int k) : value{i, j, k} {}
+  constexpr Threads(int i, int j, int k) : value {i, j, k} {}
 };
 
-struct Lanes {
+struct Lanes
+{
   int value;
 
   RAJA_INLINE
@@ -129,7 +139,8 @@ struct Lanes {
   constexpr Lanes(int i) : value(i) {}
 };
 
-struct LaunchParams {
+struct LaunchParams
+{
 public:
   Teams teams;
   Threads threads;
@@ -138,67 +149,71 @@ struct LaunchParams {
   RAJA_INLINE
   LaunchParams() = default;
 
-  LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0)
-    : teams(in_teams), threads(in_threads), shared_mem_size(in_shared_mem_size) {};
+  LaunchParams(Teams in_teams,
+               Threads in_threads,
+               size_t in_shared_mem_size = 0)
+      : teams(in_teams),
+        threads(in_threads),
+        shared_mem_size(in_shared_mem_size) {};
 
 private:
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  Teams apply(Teams const &a) { return (teams = a); }
+  Teams apply(Teams const& a) { return (teams = a); }
 
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  Threads apply(Threads const &a) { return (threads = a); }
+  Threads apply(Threads const& a) { return (threads = a); }
 };
 
 class LaunchContext
 {
 public:
-
-  //Bump style allocator used to
-  //get memory from the pool
+  // Bump style allocator used to
+  // get memory from the pool
   size_t shared_mem_offset;
 
-  void *shared_mem_ptr;
+  void* shared_mem_ptr;
 
 #if defined(RAJA_ENABLE_SYCL)
-  mutable cl::sycl::nd_item<3> *itm;
+  mutable cl::sycl::nd_item<3>* itm;
 #endif
 
   RAJA_HOST_DEVICE LaunchContext()
-    : shared_mem_offset(0), shared_mem_ptr(nullptr)
-  {
-  }
+      : shared_mem_offset(0), shared_mem_ptr(nullptr)
+  {}
 
-  //TODO handle alignment
-  template<typename T>
+  // TODO handle alignment
+  template <typename T>
   RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes)
   {
 
-    //Calculate offset in bytes with a char pointer
-    void* mem_ptr = static_cast<char *>(shared_mem_ptr) + shared_mem_offset;
+    // Calculate offset in bytes with a char pointer
+    void* mem_ptr = static_cast<char*>(shared_mem_ptr) + shared_mem_offset;
 
-    shared_mem_offset += bytes*sizeof(T);
+    shared_mem_offset += bytes * sizeof(T);
 
-    //convert to desired type
+    // convert to desired type
     return static_cast<T*>(mem_ptr);
   }
 
   /*
   //Odd dependecy with atomics is breaking CI builds
-  template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t z_stride=DIM-1, typename arg, typename... args>
-  RAJA_HOST_DEVICE auto getSharedMemoryView(size_t bytes, arg idx, args... idxs)
+  template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t
+  z_stride=DIM-1, typename arg, typename... args> RAJA_HOST_DEVICE auto
+  getSharedMemoryView(size_t bytes, arg idx, args... idxs)
   {
     T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset];
 
     shared_mem_offset += bytes*sizeof(T);
-    return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx, idxs...);
+    return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx,
+  idxs...);
   }
   */
 
   RAJA_HOST_DEVICE void releaseSharedMemory()
   {
-    //On the cpu/gpu we want to restart the count
+    // On the cpu/gpu we want to restart the count
     shared_mem_offset = 0;
   }
 
@@ -218,19 +233,24 @@ class LaunchContext
 template <typename LAUNCH_POLICY>
 struct LaunchExecute;
 
-//Policy based launch with support to new reducers...
-template <typename LAUNCH_POLICY, typename ... ReduceParams>
-void launch(LaunchParams const &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+// Policy based launch with support for new reducers...
+template <typename LAUNCH_POLICY, typename... ReduceParams>
+void launch(LaunchParams const& launch_params,
+            const char* kernel_name,
+            ReduceParams&&... rest_of_launch_args)
 {
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto&& launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  //Take the first policy as we assume the second policy is not user defined.
-  //We rely on the user to pair launch and loop policies correctly.
-  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  // Take the first policy as we assume the second policy is not user defined.
+  // We rely on the user to pair launch and loop policies correctly.
+  util::PluginContext context {
+      util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -242,29 +262,36 @@ void launch(LaunchParams const &launch_params, const char *kernel_name, ReducePa
 
   using launch_t = LaunchExecute<typename LAUNCH_POLICY::host_policy_t>;
 
-  using Res = typename resources::get_resource<typename LAUNCH_POLICY::host_policy_t>::type;
+  using Res = typename resources::get_resource<
+      typename LAUNCH_POLICY::host_policy_t>::type;
 
-  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers);
+  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body,
+                 reducers);
 
   util::callPostLaunchPlugins(context);
 }
 
 
-//Duplicate of code above on account that we need to support the case in which a kernel_name is not given
-template <typename LAUNCH_POLICY, typename ... ReduceParams>
-void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args)
+// Duplicate of code above on account that we need to support the case in which
+// a kernel_name is not given
+template <typename LAUNCH_POLICY, typename... ReduceParams>
+void launch(LaunchParams const& launch_params,
+            ReduceParams&&... rest_of_launch_args)
 {
 
-  const char *kernel_name = nullptr;
+  const char* kernel_name = nullptr;
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto&& launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  //Take the first policy as we assume the second policy is not user defined.
-  //We rely on the user to pair launch and loop policies correctly.
-  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  // Take the first policy as we assume the second policy is not user defined.
+  // We rely on the user to pair launch and loop policies correctly.
+  util::PluginContext context {
+      util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -276,148 +303,200 @@ void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_
 
   using launch_t = LaunchExecute<typename LAUNCH_POLICY::host_policy_t>;
 
-  using Res = typename resources::get_resource<typename LAUNCH_POLICY::host_policy_t>::type;
+  using Res = typename resources::get_resource<
+      typename LAUNCH_POLICY::host_policy_t>::type;
 
-  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers);
+  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body,
+                 reducers);
 
   util::callPostLaunchPlugins(context);
 }
 
 //=================================================
-//Run time based policy launch
+// Run time based policy launch
 //=================================================
 template <typename POLICY_LIST, typename BODY>
-void launch(ExecPlace place, LaunchParams const &params, BODY const &body)
+void launch(ExecPlace place, LaunchParams const& params, BODY const& body)
 {
   launch<POLICY_LIST>(place, params, nullptr, body);
 }
 
 template <typename POLICY_LIST, typename BODY>
-void launch(ExecPlace place, const LaunchParams &params, const char *kernel_name, BODY const &body)
+void launch(ExecPlace place,
+            const LaunchParams& params,
+            const char* kernel_name,
+            BODY const& body)
 {
 
-  //Forward to single policy launch API - simplifies testing of plugins
-  switch (place) {
-    case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(Res::get_default(), params, kernel_name, body);
-      break;
-    }
+  // Forward to single policy launch API - simplifies testing of plugins
+  switch (place)
+  {
+  case ExecPlace::HOST:
+  {
+    using Res = typename resources::get_resource<
+        typename POLICY_LIST::host_policy_t>::type;
+    launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+        Res::get_default(), params, kernel_name, body);
+    break;
+  }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(Res::get_default(), params, kernel_name, body);
-      break;
-    }
+  case ExecPlace::DEVICE:
+  {
+    using Res = typename resources::get_resource<
+        typename POLICY_LIST::device_policy_t>::type;
+    launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+        Res::get_default(), params, kernel_name, body);
+    break;
+  }
 #endif
-    default:
-      RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
+  default:
+    RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
-//Run-time API for new reducer interface
+// Run-time API for new reducer interface
 template <typename POLICY_LIST, typename... ReduceParams>
-void launch(ExecPlace place, const LaunchParams &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+void launch(ExecPlace place,
+            const LaunchParams& launch_params,
+            const char* kernel_name,
+            ReduceParams&&... rest_of_launch_args)
 {
 
-  //Forward to single policy launch API - simplifies testing of plugins
-  switch (place) {
-    case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
-      break;
-    }
+  // Forward to single policy launch API - simplifies testing of plugins
+  switch (place)
+  {
+  case ExecPlace::HOST:
+  {
+    using Res = typename resources::get_resource<
+        typename POLICY_LIST::host_policy_t>::type;
+    launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+        Res::get_default(), launch_params, kernel_name,
+        std::forward<ReduceParams>(rest_of_launch_args)...);
+    break;
+  }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
-      break;
-    }
+  case ExecPlace::DEVICE:
+  {
+    using Res = typename resources::get_resource<
+        typename POLICY_LIST::device_policy_t>::type;
+    launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+        Res::get_default(), launch_params, kernel_name,
+        std::forward<ReduceParams>(rest_of_launch_args)...);
+    break;
+  }
 #endif
-    default:
-      RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
+  default:
+    RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
-//Run-time API for new reducer interface with support of the case without a new kernel name
+// Run-time API for new reducer interface with support of the case without a new
+// kernel name
 template <typename POLICY_LIST, typename... ReduceParams>
-void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&... rest_of_launch_args)
-            //BODY const &body)
+void launch(ExecPlace place,
+            const LaunchParams& launch_params,
+            ReduceParams&&... rest_of_launch_args)
+// BODY const &body)
 {
 
-  const char *kernel_name = nullptr;
+  const char* kernel_name = nullptr;
 
-  //Forward to single policy launch API - simplifies testing of plugins
-  switch (place) {
-    case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
-      break;
-    }
+  // Forward to single policy launch API - simplifies testing of plugins
+  switch (place)
+  {
+  case ExecPlace::HOST:
+  {
+    using Res = typename resources::get_resource<
+        typename POLICY_LIST::host_policy_t>::type;
+    launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+        Res::get_default(), launch_params, kernel_name,
+        std::forward<ReduceParams>(rest_of_launch_args)...);
+    break;
+  }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
-      break;
-    }
+  case ExecPlace::DEVICE:
+  {
+    using Res = typename resources::get_resource<
+        typename POLICY_LIST::device_policy_t>::type;
+    launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+        Res::get_default(), launch_params, kernel_name,
+        std::forward<ReduceParams>(rest_of_launch_args)...);
+    break;
+  }
 #endif
-    default:
-      RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
+  default:
+    RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
 
-// Helper function to retrieve a resource based on the run-time policy - if a device is active
-#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL)
-template<typename T, typename U>
-RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){
-  if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);}
-  else { return RAJA::resources::Resource(host_res); }
+// Helper function to retrieve a resource based on the run-time policy - if a
+// device is active
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) ||                   \
+    defined(RAJA_ENABLE_SYCL)
+template <typename T, typename U>
+RAJA::resources::Resource
+Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device)
+{
+  if (device == RAJA::ExecPlace::DEVICE)
+  {
+    return RAJA::resources::Resource(device_res);
+  }
+  else
+  {
+    return RAJA::resources::Resource(host_res);
+  }
 }
 #endif
 
-template<typename T>
-RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){
-  if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");}
+template <typename T>
+RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device)
+{
+  if (device == RAJA::ExecPlace::DEVICE)
+  {
+    RAJA_ABORT_OR_THROW("Device is not enabled");
+  }
 
   return RAJA::resources::Resource(host_res);
 }
 
-//Launch API which takes team resource struct and supports new reducers
-template <typename POLICY_LIST, typename ... ReduceParams>
+// Launch API which takes team resource struct and supports new reducers
+template <typename POLICY_LIST, typename... ReduceParams>
 resources::EventProxy<resources::Resource>
-launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+launch(RAJA::resources::Resource res,
+       LaunchParams const& launch_params,
+       const char* kernel_name,
+       ReduceParams&&... rest_of_launch_args)
 {
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto&& launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   ExecPlace place;
-  if(res.get_platform() == RAJA::Platform::host) {
+  if (res.get_platform() == RAJA::Platform::host)
+  {
     place = RAJA::ExecPlace::HOST;
-  } else {
+  }
+  else
+  {
     place = RAJA::ExecPlace::DEVICE;
   }
 
   //
-  //Configure plugins
+  // Configure plugins
   //
 #if defined(RAJA_GPU_ACTIVE)
-  util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>() :
-      util::make_context<typename POLICY_LIST::device_policy_t>()};
+  util::PluginContext context {
+      place == ExecPlace::HOST
+          ? util::make_context<typename POLICY_LIST::host_policy_t>()
+          : util::make_context<typename POLICY_LIST::device_policy_t>()};
 #else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+  util::PluginContext context {
+      util::make_context<typename POLICY_LIST::host_policy_t>()};
 #endif
 
   util::callPreCapturePlugins(context);
@@ -429,24 +508,30 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
 
   util::callPreLaunchPlugins(context);
 
-  switch (place) {
-    case ExecPlace::HOST: {
-      using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
-      util::callPostLaunchPlugins(context);
-      return e_proxy;
-    }
+  switch (place)
+  {
+  case ExecPlace::HOST:
+  {
+    using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
+    resources::EventProxy<resources::Resource> e_proxy =
+        launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+    util::callPostLaunchPlugins(context);
+    return e_proxy;
+  }
 #if defined(RAJA_GPU_ACTIVE)
-    case ExecPlace::DEVICE: {
-      using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name,  p_body, reducers);
-      util::callPostLaunchPlugins(context);
-      return e_proxy;
-    }
+  case ExecPlace::DEVICE:
+  {
+    using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
+    resources::EventProxy<resources::Resource> e_proxy =
+        launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+    util::callPostLaunchPlugins(context);
+    return e_proxy;
+  }
 #endif
-    default: {
-      RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
-    }
+  default:
+  {
+    RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
+  }
   }
 
   RAJA_ABORT_OR_THROW("Unknown launch place");
@@ -456,36 +541,45 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
 }
 
 
-//Duplicate of API above on account that we need to handle the case that a kernel name is not provided
-template <typename POLICY_LIST, typename ... ReduceParams>
+// Duplicate of API above on account that we need to handle the case that a
+// kernel name is not provided
+template <typename POLICY_LIST, typename... ReduceParams>
 resources::EventProxy<resources::Resource>
-launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
+launch(RAJA::resources::Resource res,
+       LaunchParams const& launch_params,
        ReduceParams&&... rest_of_launch_args)
 {
 
-  const char *kernel_name = nullptr;
+  const char* kernel_name = nullptr;
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto&& launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   ExecPlace place;
-  if(res.get_platform() == RAJA::Platform::host) {
+  if (res.get_platform() == RAJA::Platform::host)
+  {
     place = RAJA::ExecPlace::HOST;
-  } else {
+  }
+  else
+  {
     place = RAJA::ExecPlace::DEVICE;
   }
 
   //
-  //Configure plugins
+  // Configure plugins
   //
 #if defined(RAJA_GPU_ACTIVE)
-  util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>() :
-      util::make_context<typename POLICY_LIST::device_policy_t>()};
+  util::PluginContext context {
+      place == ExecPlace::HOST
+          ? util::make_context<typename POLICY_LIST::host_policy_t>()
+          : util::make_context<typename POLICY_LIST::device_policy_t>()};
 #else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+  util::PluginContext context {
+      util::make_context<typename POLICY_LIST::host_policy_t>()};
 #endif
 
   util::callPreCapturePlugins(context);
@@ -497,24 +591,30 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
 
   util::callPreLaunchPlugins(context);
 
-  switch (place) {
-    case ExecPlace::HOST: {
-      using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
-      util::callPostLaunchPlugins(context);
-      return e_proxy;
-    }
+  switch (place)
+  {
+  case ExecPlace::HOST:
+  {
+    using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
+    resources::EventProxy<resources::Resource> e_proxy =
+        launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+    util::callPostLaunchPlugins(context);
+    return e_proxy;
+  }
 #if defined(RAJA_GPU_ACTIVE)
-    case ExecPlace::DEVICE: {
-      using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
-      util::callPostLaunchPlugins(context);
-      return e_proxy;
-    }
+  case ExecPlace::DEVICE:
+  {
+    using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
+    resources::EventProxy<resources::Resource> e_proxy =
+        launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+    util::callPostLaunchPlugins(context);
+    return e_proxy;
+  }
 #endif
-    default: {
-      RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
-    }
+  default:
+  {
+    RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
+  }
   }
 
   RAJA_ABORT_OR_THROW("Unknown launch place");
@@ -523,7 +623,7 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
   return resources::EventProxy<resources::Resource>(res);
 }
 
-template<typename POLICY_LIST>
+template <typename POLICY_LIST>
 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
 using loop_policy = typename POLICY_LIST::device_policy_t;
 #else
@@ -541,28 +641,23 @@ template <typename POLICY_LIST,
           typename CONTEXT,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
-                                       SEGMENT const &segment,
-                                       BODY const &body)
+RAJA_HOST_DEVICE RAJA_INLINE void
+loop(CONTEXT const& ctx, SEGMENT const& segment, BODY const& body)
 {
 
-  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       segment,
-                                                       body);
+  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment, body);
 }
 
 template <typename POLICY_LIST,
           typename CONTEXT,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
-                                          SEGMENT const &segment,
-                                          BODY const &body)
+RAJA_HOST_DEVICE RAJA_INLINE void
+loop_icount(CONTEXT const& ctx, SEGMENT const& segment, BODY const& body)
 {
 
-  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                          segment,
-                                                          body);
+  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment,
+                                                             body);
 }
 
 namespace expt
@@ -573,15 +668,13 @@ template <typename POLICY_LIST,
           typename CONTEXT,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       BODY const &body)
+RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx,
+                                       SEGMENT const& segment0,
+                                       SEGMENT const& segment1,
+                                       BODY const& body)
 {
 
-  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       segment0,
-                                                       segment1,
+  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment0, segment1,
                                                        body);
 }
 
@@ -590,18 +683,15 @@ template <typename POLICY_LIST,
           typename CONTEXT,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       SEGMENT const &segment2,
-                                       BODY const &body)
+RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx,
+                                       SEGMENT const& segment0,
+                                       SEGMENT const& segment1,
+                                       SEGMENT const& segment2,
+                                       BODY const& body)
 {
 
-  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       segment0,
-                                                       segment1,
-                                                       segment2,
-                                                       body);
+  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment0, segment1,
+                                                       segment2, body);
 }
 
 RAJA_SUPPRESS_HD_WARN
@@ -609,18 +699,18 @@ template <typename POLICY_LIST,
           typename CONTEXT,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       SEGMENT const &segment2,
-                                       BODY const &body)
+RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const& ctx,
+                                              SEGMENT const& segment0,
+                                              SEGMENT const& segment1,
+                                              SEGMENT const& segment2,
+                                              BODY const& body)
 {
 
-  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                           segment0, segment1, segment2, body);
+  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, segment0, segment1, segment2, body);
 }
 
-} //namespace expt
+}  // namespace expt
 
 template <typename POLICY, typename SEGMENT>
 struct TileExecute;
@@ -633,15 +723,13 @@ template <typename POLICY_LIST,
           typename TILE_T,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
+RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx,
                                        TILE_T tile_size,
-                                       SEGMENT const &segment,
-                                       BODY const &body)
+                                       SEGMENT const& segment,
+                                       BODY const& body)
 {
 
-  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       tile_size,
-                                                       segment,
+  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, tile_size, segment,
                                                        body);
 }
 
@@ -650,15 +738,13 @@ template <typename POLICY_LIST,
           typename TILE_T,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
-                                       TILE_T tile_size,
-                                       SEGMENT const &segment,
-                                       BODY const &body)
+RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx,
+                                              TILE_T tile_size,
+                                              SEGMENT const& segment,
+                                              BODY const& body)
 {
-  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                          tile_size,
-                                                          segment,
-                                                          body);
+  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, tile_size,
+                                                             segment, body);
 }
 
 namespace expt
@@ -669,20 +755,16 @@ template <typename POLICY_LIST,
           typename TILE_T,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
+RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx,
                                        TILE_T tile_size0,
                                        TILE_T tile_size1,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       BODY const &body)
+                                       SEGMENT const& segment0,
+                                       SEGMENT const& segment1,
+                                       BODY const& body)
 {
 
-  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       tile_size0,
-                                                       tile_size1,
-                                                       segment0,
-                                                       segment1,
-                                                       body);
+  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, tile_size0, tile_size1, segment0, segment1, body);
 }
 
 template <typename POLICY_LIST,
@@ -690,23 +772,19 @@ template <typename POLICY_LIST,
           typename TILE_T,
           typename SEGMENT,
           typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
-                                       TILE_T tile_size0,
-                                       TILE_T tile_size1,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       BODY const &body)
+RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx,
+                                              TILE_T tile_size0,
+                                              TILE_T tile_size1,
+                                              SEGMENT const& segment0,
+                                              SEGMENT const& segment1,
+                                              BODY const& body)
 {
 
-  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                          tile_size0,
-                                                          tile_size1,
-                                                          segment0,
-                                                          segment1,
-                                                          body);
+  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, tile_size0, tile_size1, segment0, segment1, body);
 }
 
-} //namespace expt
+}  // namespace expt
 
 }  // namespace RAJA
 #endif
diff --git a/include/RAJA/pattern/multi_reduce.hpp b/include/RAJA/pattern/multi_reduce.hpp
index 3fbe36877c..ca3f4e58d0 100644
--- a/include/RAJA/pattern/multi_reduce.hpp
+++ b/include/RAJA/pattern/multi_reduce.hpp
@@ -156,7 +156,7 @@ struct MultiReduceSum;
  */
 template <typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceBitOr;
- 
+
 
 /*!
  ******************************************************************************
@@ -171,7 +171,8 @@ struct MultiReduceBitOr;
    Index_ptr bins = ...;
    Real_ptr bit_vals = ...;
 
-   MultiReduceBitAnd<multi_reduce_policy, Real_type> my_bits(num_bins, init_val);
+   MultiReduceBitAnd<multi_reduce_policy, Real_type> my_bits(num_bins,
+ init_val);
 
    forall<exec_policy>( ..., [=] (Index_type i) {
       my_bits[bins[i]] &= (data[i]);
@@ -188,7 +189,7 @@ struct MultiReduceBitOr;
 template <typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceBitAnd;
 
-} //namespace RAJA
+}  // namespace RAJA
 
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp
index 5a656206f5..0cb36f597c 100644
--- a/include/RAJA/pattern/params/forall.hpp
+++ b/include/RAJA/pattern/params/forall.hpp
@@ -21,348 +21,436 @@ namespace RAJA
 namespace expt
 {
 
-  //
-  //
-  // Forall Parameter Packing type
-  //
-  //
-  struct ParamMultiplexer;
-
-  template<typename... Params>
-  struct ForallParamPack {
-
-    friend struct ParamMultiplexer;
-
-    using Base = camp::tuple<Params...>;
-    Base param_tup;
-
-    static constexpr size_t param_tup_sz = camp::tuple_size<Base>::value; 
-    using params_seq = camp::make_idx_seq_t< param_tup_sz >;
-
-  private:
-
-    // Init
-    template<typename EXEC_POL, camp::idx_t... Seq, typename ...Args>
-    static constexpr void detail_init(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params, Args&& ...args) {
-      CAMP_EXPAND(expt::detail::init<EXEC_POL>( camp::get<Seq>(f_params.param_tup), std::forward<Args>(args)... ));
-    }
-
-    // Combine
-    template<typename EXEC_POL, camp::idx_t... Seq>
-    RAJA_HOST_DEVICE
-    static constexpr void detail_combine(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& out, const ForallParamPack& in ) {
-      CAMP_EXPAND(detail::combine<EXEC_POL>( camp::get<Seq>(out.param_tup), camp::get<Seq>(in.param_tup)));
-    }
-
-    template<typename EXEC_POL, camp::idx_t... Seq>
-    RAJA_HOST_DEVICE
-    static constexpr void detail_combine(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params ) {
-      CAMP_EXPAND(detail::combine<EXEC_POL>( camp::get<Seq>(f_params.param_tup) ));
-    }
-    
-    // Resolve
-    template<typename EXEC_POL, camp::idx_t... Seq, typename ...Args>
-    static constexpr void detail_resolve(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params, Args&& ...args) {
-      CAMP_EXPAND(detail::resolve<EXEC_POL>( camp::get<Seq>(f_params.param_tup), std::forward<Args>(args)... ));
-    }
-
-    // Used to construct the argument TYPES that will be invoked with the lambda.
-    template<typename null_t = camp::nil>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple<>{}; };
-    template<typename null_t = camp::nil, typename First>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return typename First::ARG_TUP_T(); };
-    template<typename null_t = camp::nil, typename First, typename Second, typename... Rest>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T<camp::nil, Second, Rest...>()); };
-
-    using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T<camp::nil, Params...>());
-    
-    //Use the size of param_tup to generate the argument list.
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); }
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get<param_tup_sz - 1>(param_tup).get_lambda_arg_tup(); }
-    template<camp::idx_t N>
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<N>) {
-      return camp::tuple_cat_pair(  camp::get<param_tup_sz - N>(param_tup).get_lambda_arg_tup(), LAMBDA_ARG_TUP_V(camp::num<N-1>())  );
-    }
-
-  public:
-    ForallParamPack(){}
-
-    RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() {return LAMBDA_ARG_TUP_V(camp::num<sizeof...(Params)>());}
-
-    using lambda_arg_seq = camp::make_idx_seq_t<camp::tuple_size<lambda_arg_tuple_t>::value>;
-
-    template<typename... Ts>
-    ForallParamPack(camp::tuple<Ts...>&& t) : param_tup(std::move(t)) {};
-  }; // struct ForallParamPack 
-  
-
-
-  //===========================================================================
-  //
-  //
-  // ParamMultiplexer is how we hook into the individual calls within forall_impl.
-  //
-  //
-  struct ParamMultiplexer {
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr init( ForallParamPack<Params...>& f_params, Args&& ...args) {
-      FP::detail_init(EXEC_POL(),typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr combine(ForallParamPack<Params...>& f_params, Args&& ...args){
-      FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr resolve( ForallParamPack<Params...>& f_params, Args&& ...args){
-      FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-  };
-  //===========================================================================
+//
+//
+// Forall Parameter Packing type
+//
+//
+struct ParamMultiplexer;
+
+template <typename... Params>
+struct ForallParamPack
+{
+
+  friend struct ParamMultiplexer;
 
+  using Base = camp::tuple<Params...>;
+  Base param_tup;
 
+  static constexpr size_t param_tup_sz = camp::tuple_size<Base>::value;
+  using params_seq                     = camp::make_idx_seq_t<param_tup_sz>;
+
+private:
+  // Init
+  template <typename EXEC_POL, camp::idx_t... Seq, typename... Args>
+  static constexpr void detail_init(EXEC_POL,
+                                    camp::idx_seq<Seq...>,
+                                    ForallParamPack& f_params,
+                                    Args&&... args)
+  {
+    CAMP_EXPAND(expt::detail::init<EXEC_POL>(camp::get<Seq>(f_params.param_tup),
+                                             std::forward<Args>(args)...));
+  }
+
+  // Combine
+  template <typename EXEC_POL, camp::idx_t... Seq>
+  RAJA_HOST_DEVICE static constexpr void
+  detail_combine(EXEC_POL,
+                 camp::idx_seq<Seq...>,
+                 ForallParamPack& out,
+                 const ForallParamPack& in)
+  {
+    CAMP_EXPAND(detail::combine<EXEC_POL>(camp::get<Seq>(out.param_tup),
+                                          camp::get<Seq>(in.param_tup)));
+  }
 
-  //===========================================================================
-  //
-  //
-  // ForallParamPack generators.
-  //
-  //
-  RAJA_INLINE static auto get_empty_forall_param_pack(){
-    static ForallParamPack<> p;
-    return p;
+  template <typename EXEC_POL, camp::idx_t... Seq>
+  RAJA_HOST_DEVICE static constexpr void
+  detail_combine(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params)
+  {
+    CAMP_EXPAND(detail::combine<EXEC_POL>(camp::get<Seq>(f_params.param_tup)));
   }
 
-  namespace detail {
-    // all_true trick to perform variadic expansion in static asserts.
-    // https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template
-    template<bool...> struct bool_pack;
-    template<bool... bs>
-    using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>;
+  // Resolve
+  template <typename EXEC_POL, camp::idx_t... Seq, typename... Args>
+  static constexpr void detail_resolve(EXEC_POL,
+                                       camp::idx_seq<Seq...>,
+                                       ForallParamPack& f_params,
+                                       Args&&... args)
+  {
+    CAMP_EXPAND(detail::resolve<EXEC_POL>(camp::get<Seq>(f_params.param_tup),
+                                          std::forward<Args>(args)...));
+  }
 
-    template<typename Base, typename... Ts>
-    using check_types_derive_base = all_true<std::is_convertible<Ts, Base>::value...>;
-  } // namespace detail
+  // Used to construct the argument TYPES that will be invoked with the lambda.
+  template <typename null_t = camp::nil>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return camp::tuple<> {};
+  };
+  template <typename null_t = camp::nil, typename First>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return typename First::ARG_TUP_T();
+  };
+  template <typename null_t = camp::nil,
+            typename First,
+            typename Second,
+            typename... Rest>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return camp::tuple_cat_pair(typename First::ARG_TUP_T(),
+                                LAMBDA_ARG_TUP_T<camp::nil, Second, Rest...>());
+  };
 
+  using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T<camp::nil, Params...>());
 
-  template<typename... Ts>
-  constexpr auto make_forall_param_pack_from_tuple(camp::tuple<Ts...>&& tuple) {
-    static_assert(detail::check_types_derive_base<detail::ForallParamBase, camp::decay<Ts>...>::value,
-        "Forall optional arguments do not derive ForallParamBase. Please see Reducer, ReducerLoc and KernelName for examples.") ;
-    return ForallParamPack<camp::decay<Ts>...>(std::move(tuple));
+  // Use the size of param_tup to generate the argument list.
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>)
+  {
+    return camp::make_tuple();
+  }
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>)
+  {
+    return camp::get<param_tup_sz - 1>(param_tup).get_lambda_arg_tup();
+  }
+  template <camp::idx_t N>
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<N>)
+  {
+    return camp::tuple_cat_pair(
+        camp::get<param_tup_sz - N>(param_tup).get_lambda_arg_tup(),
+        LAMBDA_ARG_TUP_V(camp::num<N - 1>()));
   }
 
-  
+public:
+  ForallParamPack() {}
 
-  namespace detail {
-    // Maybe we should do a lot of these with structs...
-    template<camp::idx_t... Seq, typename TupleType>
-    constexpr auto tuple_from_seq (const camp::idx_seq<Seq...>&, TupleType&& tuple){
-      return camp::forward_as_tuple( camp::get< Seq >(std::forward<TupleType>(tuple))... );
-    };
+  RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args()
+  {
+    return LAMBDA_ARG_TUP_V(camp::num<sizeof...(Params)>());
+  }
 
-    template<typename... Ts>
-    constexpr auto strip_last_elem(camp::tuple<Ts...>&& tuple){
-      return tuple_from_seq(camp::make_idx_seq_t<sizeof...(Ts)-1>{},std::move(tuple));
-    };
-  } // namespace detail
+  using lambda_arg_seq =
+      camp::make_idx_seq_t<camp::tuple_size<lambda_arg_tuple_t>::value>;
 
+  template <typename... Ts>
+  ForallParamPack(camp::tuple<Ts...>&& t) : param_tup(std::move(t)) {};
+};  // struct ForallParamPack
 
-  // Make a tuple of the param pack except the final element...
-  template<typename... Args>
-  constexpr auto make_forall_param_pack(Args&&... args){
-    // We assume the last element of the pack is the lambda so we need to strip it from the list.
-    auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward<Args>(args)...) ); 
-    return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple));
+
+//===========================================================================
+//
+//
+// ParamMultiplexer is how we hook into the individual calls within forall_impl.
+//
+//
+struct ParamMultiplexer
+{
+  template <typename EXEC_POL,
+            typename... Params,
+            typename... Args,
+            typename FP = ForallParamPack<Params...>>
+  static void constexpr init(ForallParamPack<Params...>& f_params,
+                             Args&&... args)
+  {
+    FP::detail_init(EXEC_POL(), typename FP::params_seq(), f_params,
+                    std::forward<Args>(args)...);
+  }
+  template <typename EXEC_POL,
+            typename... Params,
+            typename... Args,
+            typename FP = ForallParamPack<Params...>>
+  static void constexpr combine(ForallParamPack<Params...>& f_params,
+                                Args&&... args)
+  {
+    FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params,
+                       std::forward<Args>(args)...);
+  }
+  template <typename EXEC_POL,
+            typename... Params,
+            typename... Args,
+            typename FP = ForallParamPack<Params...>>
+  static void constexpr resolve(ForallParamPack<Params...>& f_params,
+                                Args&&... args)
+  {
+    FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params,
+                       std::forward<Args>(args)...);
   }
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Callable should be the last argument in the param pack, just extract it...
-  //
-  //
-  template<typename... Args>
-  constexpr auto&& get_lambda(Args&&... args){
-    return camp::get<sizeof...(Args)-1>( camp::forward_as_tuple(std::forward<Args>(args)...) );
-  } 
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Checking expected argument list against the assumed lambda.
-  //
-  //
-  namespace detail {
-
-    // 
-    //
-    // Lambda traits Utilities
-    // 
-    //
-    template<class F>
-    struct lambda_traits;
-
-    template<class R, class C, class First, class... Rest>
-    struct lambda_traits<R (C::*)(First, Rest...)>
-    {  // non-const specialization
-      using arg_type = First; 
-    };
-    template<class R, class C, class First, class... Rest>
-    struct lambda_traits<R (C::*)(First, Rest...) const>
-    {  // const specialization
-      using arg_type = First; 
-    };
-
-    template<class T>
-    typename lambda_traits<T>::arg_type* lambda_arg_helper(T);
-
-
-    // 
-    //
-    // List manipulation Utilities
-    // 
-    //
-    template<typename... Ts>
-    constexpr auto list_remove_pointer(const camp::list<Ts...>&){
-      return camp::list<camp::decay<typename std::remove_pointer<Ts>::type>...>{};
-    }
-    
-    template<typename... Ts>
-    constexpr auto list_add_lvalue_ref(const camp::list<Ts...>&){
-      return camp::list<typename std::add_lvalue_reference<Ts>::type...>{};
-    }
-
-    template<typename... Ts>
-    constexpr auto tuple_to_list(const camp::tuple<Ts...>&) {
-      return camp::list<Ts...>{};
-    }
-
-    // TODO : Change to std::is_invocable at c++17
-    template <typename F, typename... Args>
-    struct is_invocable :
-      std::is_constructible<
-        std::function<void(Args ...)>,
-        std::reference_wrapper<typename std::remove_reference<F>::type>
-      >{};
-
-    template<class...>
-    using void_t = void;
-
-    template<class F, class=void>
-    struct has_empty_op : std::false_type{};
-
-    template<class F>
-    struct has_empty_op<F, void_t<decltype(std::declval<F::operator()>)>> : std::true_type{};
-
-    template<class F>
-    struct get_lambda_index_type {
-      typedef typename std::remove_pointer<
-                decltype(lambda_arg_helper(
-                      &camp::decay<F>::operator())
-                )
-              >::type type;
-    };
-
-    // If LAMBDA::operator() is not available this probably isn't a generic lambda and we can't extract and check args.
-    template<typename LAMBDA, typename... EXPECTED_ARGS>
-    constexpr concepts::enable_if<concepts::negate<has_empty_op<LAMBDA>>> check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&) {}
-
-    template<typename LAMBDA, typename... EXPECTED_ARGS>
-    constexpr concepts::enable_if<has_empty_op<LAMBDA>> check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&) {
+};
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// ForallParamPack generators.
+//
+//
+RAJA_INLINE static auto get_empty_forall_param_pack()
+{
+  static ForallParamPack<> p;
+  return p;
+}
+
+namespace detail
+{
+// all_true trick to perform variadic expansion in static asserts.
+// https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template
+template <bool...>
+struct bool_pack;
+template <bool... bs>
+using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>;
+
+template <typename Base, typename... Ts>
+using check_types_derive_base =
+    all_true<std::is_convertible<Ts, Base>::value...>;
+}  // namespace detail
+
+
+template <typename... Ts>
+constexpr auto make_forall_param_pack_from_tuple(camp::tuple<Ts...>&& tuple)
+{
+  static_assert(detail::check_types_derive_base<detail::ForallParamBase,
+                                                camp::decay<Ts>...>::value,
+                "Forall optional arguments do not derive ForallParamBase. "
+                "Please see Reducer, ReducerLoc and KernelName for examples.");
+  return ForallParamPack<camp::decay<Ts>...>(std::move(tuple));
+}
+
+
+namespace detail
+{
+// Maybe we should do a lot of these with structs...
+template <camp::idx_t... Seq, typename TupleType>
+constexpr auto tuple_from_seq(const camp::idx_seq<Seq...>&, TupleType&& tuple)
+{
+  return camp::forward_as_tuple(
+      camp::get<Seq>(std::forward<TupleType>(tuple))...);
+};
+
+template <typename... Ts>
+constexpr auto strip_last_elem(camp::tuple<Ts...>&& tuple)
+{
+  return tuple_from_seq(camp::make_idx_seq_t<sizeof...(Ts) - 1> {},
+                        std::move(tuple));
+};
+}  // namespace detail
+
+
+// Make a tuple of the param pack except the final element...
+template <typename... Args>
+constexpr auto make_forall_param_pack(Args&&... args)
+{
+  // We assume the last element of the pack is the lambda so we need to strip it
+  // from the list.
+  auto stripped_arg_tuple = detail::strip_last_elem(
+      camp::forward_as_tuple(std::forward<Args>(args)...));
+  return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple));
+}
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Callable should be the last argument in the param pack, just extract it...
+//
+//
+template <typename... Args>
+constexpr auto&& get_lambda(Args&&... args)
+{
+  return camp::get<sizeof...(Args) - 1>(
+      camp::forward_as_tuple(std::forward<Args>(args)...));
+}
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Checking expected argument list against the assumed lambda.
+//
+//
+namespace detail
+{
+
+//
+//
+// Lambda traits Utilities
+//
+//
+template <class F>
+struct lambda_traits;
+
+template <class R, class C, class First, class... Rest>
+struct lambda_traits<R (C::*)(First, Rest...)>
+{  // non-const specialization
+  using arg_type = First;
+};
+template <class R, class C, class First, class... Rest>
+struct lambda_traits<R (C::*)(First, Rest...) const>
+{  // const specialization
+  using arg_type = First;
+};
+
+template <class T>
+typename lambda_traits<T>::arg_type* lambda_arg_helper(T);
+
+
+//
+//
+// List manipulation Utilities
+//
+//
+template <typename... Ts>
+constexpr auto list_remove_pointer(const camp::list<Ts...>&)
+{
+  return camp::list<camp::decay<typename std::remove_pointer<Ts>::type>...> {};
+}
+
+template <typename... Ts>
+constexpr auto list_add_lvalue_ref(const camp::list<Ts...>&)
+{
+  return camp::list<typename std::add_lvalue_reference<Ts>::type...> {};
+}
+
+template <typename... Ts>
+constexpr auto tuple_to_list(const camp::tuple<Ts...>&)
+{
+  return camp::list<Ts...> {};
+}
+
+// TODO : Change to std::is_invocable at c++17
+template <typename F, typename... Args>
+struct is_invocable
+    : std::is_constructible<
+          std::function<void(Args...)>,
+          std::reference_wrapper<typename std::remove_reference<F>::type>>
+{};
+
+template <class...>
+using void_t = void;
+
+template <class F, class = void>
+struct has_empty_op : std::false_type
+{};
+
+template <class F>
+struct has_empty_op<F, void_t<decltype(std::declval<F::operator()>)>>
+    : std::true_type
+{};
+
+template <class F>
+struct get_lambda_index_type
+{
+  typedef typename std::remove_pointer<decltype(lambda_arg_helper(
+      &camp::decay<F>::operator()))>::type type;
+};
+
+// If LAMBDA::operator() is not available this probably isn't a generic lambda
+// and we can't extract and check args.
+template <typename LAMBDA, typename... EXPECTED_ARGS>
+constexpr concepts::enable_if<concepts::negate<has_empty_op<LAMBDA>>>
+check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&)
+{}
+
+template <typename LAMBDA, typename... EXPECTED_ARGS>
+constexpr concepts::enable_if<has_empty_op<LAMBDA>>
+check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&)
+{
 #if !defined(RAJA_ENABLE_HIP)
-      static_assert(is_invocable<LAMBDA, typename get_lambda_index_type<LAMBDA>::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments."); 
+  static_assert(
+      is_invocable<LAMBDA, typename get_lambda_index_type<LAMBDA>::type,
+                   EXPECTED_ARGS...>::value,
+      "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match "
+      "between RAJA::expt::Reduce() and ValOp arguments.");
 #endif
-    }
-
-  } // namespace detail
+}
 
+}  // namespace detail
 
-  template<typename Lambda, typename ForallParams>
-  constexpr 
-  void
-  check_forall_optional_args(Lambda&& l, ForallParams& fpp) {
 
-    using expected_arg_type_list = decltype( detail::list_add_lvalue_ref(
-                                               detail::list_remove_pointer(
-                                                 detail::tuple_to_list(
-                                                   fpp.lambda_args()
-                                                 )
-                                               )
-                                            ));
+template <typename Lambda, typename ForallParams>
+constexpr void check_forall_optional_args(Lambda&& l, ForallParams& fpp)
+{
 
-    detail::check_invocable(std::forward<Lambda>(l), expected_arg_type_list{});
-  }
-  //===========================================================================
-  
+  using expected_arg_type_list = decltype(detail::list_add_lvalue_ref(
+      detail::list_remove_pointer(detail::tuple_to_list(fpp.lambda_args()))));
 
+  detail::check_invocable(std::forward<Lambda>(l), expected_arg_type_list {});
+}
+//===========================================================================
 
-  //===========================================================================
-  //
-  //
-  // Type trailts for SFINAE work.
-  //
-  //
-  namespace type_traits
-  {
-    template <typename T> struct is_ForallParamPack : std::false_type {};
-    template <typename... Args> struct is_ForallParamPack<ForallParamPack<Args...>> : std::true_type {};
 
-    template <typename T> struct is_ForallParamPack_empty : std::true_type {};
-    template <typename First, typename... Rest> struct is_ForallParamPack_empty<ForallParamPack<First, Rest...>> : std::false_type {};
-    template <> struct is_ForallParamPack_empty<ForallParamPack<>> : std::true_type {};
-  }
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Invoke Forall with Params.
-  //
-  //
-  namespace detail {
-    template<camp::idx_t Idx, typename FP>
-    RAJA_HOST_DEVICE
-    constexpr
-    auto get_lambda_args(FP& fpp)
-        -> decltype(  *camp::get<Idx>( fpp.lambda_args() )  ) {
-      return (  *camp::get<Idx>( fpp.lambda_args() )  );
-    }
-
-    CAMP_SUPPRESS_HD_WARN
-    template <typename Fn,
-              camp::idx_t... Sequence,
-              typename Params,
-              typename... Ts>
-    RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params,
-                                                      Fn&& f,
-                                                      camp::idx_seq<Sequence...>,
-                                                      Ts&&... extra)
-    {
-      return f(std::forward<Ts...>(extra...), ( get_lambda_args<Sequence>(params) )...);
-    }
-  } // namespace detail
-
-  //CAMP_SUPPRESS_HD_WARN
-  template <typename Params, typename Fn, typename... Ts>
-  RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params, Fn&& f, Ts&&... extra)
-  {
-    return detail::invoke_with_order(
-        camp::forward<Params>(params),
-        camp::forward<Fn>(f),
-        typename camp::decay<Params>::lambda_arg_seq(),
-        camp::forward<Ts...>(extra)...);
-  }
-  //===========================================================================
+//===========================================================================
+//
+//
+// Type traits for SFINAE work.
+//
+//
+namespace type_traits
+{
+template <typename T>
+struct is_ForallParamPack : std::false_type
+{};
+template <typename... Args>
+struct is_ForallParamPack<ForallParamPack<Args...>> : std::true_type
+{};
+
+template <typename T>
+struct is_ForallParamPack_empty : std::true_type
+{};
+template <typename First, typename... Rest>
+struct is_ForallParamPack_empty<ForallParamPack<First, Rest...>>
+    : std::false_type
+{};
+template <>
+struct is_ForallParamPack_empty<ForallParamPack<>> : std::true_type
+{};
+}  // namespace type_traits
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Invoke Forall with Params.
+//
+//
+namespace detail
+{
+template <camp::idx_t Idx, typename FP>
+RAJA_HOST_DEVICE constexpr auto get_lambda_args(FP& fpp)
+    -> decltype(*camp::get<Idx>(fpp.lambda_args()))
+{
+  return (*camp::get<Idx>(fpp.lambda_args()));  // deref the Idx-th entry
+}
+
+CAMP_SUPPRESS_HD_WARN
+template <typename Fn, camp::idx_t... Sequence, typename Params, typename... Ts>
+RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params,
+                                                  Fn&& f,
+                                                  camp::idx_seq<Sequence...>,
+                                                  Ts&&... extra)
+{
+  return f(std::forward<Ts>(extra)...,  // forward each extra on its own
+           (get_lambda_args<Sequence>(params))...);
+}
+}  // namespace detail
+
+// CAMP_SUPPRESS_HD_WARN
+template <typename Params, typename Fn, typename... Ts>
+RAJA_HOST_DEVICE constexpr auto
+invoke_body(Params&& params, Fn&& f, Ts&&... extra)
+{
+  return detail::invoke_with_order(
+      camp::forward<Params>(params), camp::forward<Fn>(f),
+      typename camp::decay<Params>::lambda_arg_seq(),
+      camp::forward<Ts>(extra)...);  // pair each Ts with its own extra
+}
+//===========================================================================
 
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  FORALL_PARAM_HPP
+#endif  //  FORALL_PARAM_HPP
diff --git a/include/RAJA/pattern/params/kernel_name.hpp b/include/RAJA/pattern/params/kernel_name.hpp
index e768d8dd59..e9d2a6e3e1 100644
--- a/include/RAJA/pattern/params/kernel_name.hpp
+++ b/include/RAJA/pattern/params/kernel_name.hpp
@@ -10,23 +10,20 @@ namespace expt
 namespace detail
 {
 
-  struct KernelName : public ForallParamBase {
-    RAJA_HOST_DEVICE KernelName() {}
-    KernelName(const char* name_in) : name(name_in) {}
-    const char* name;
-  };
-
-} // namespace detail
-
-inline auto KernelName(const char * n)
+struct KernelName : public ForallParamBase
 {
-  return detail::KernelName(n);
-}
-} // namespace expt
+  RAJA_HOST_DEVICE KernelName() {}
+  KernelName(const char* name_in) : name(name_in) {}
+  const char* name = nullptr;  // null when default-constructed (no name given)
+};
+
+}  // namespace detail
 
+inline auto KernelName(const char* n) { return detail::KernelName(n); }
+}  // namespace expt
 
-} //  namespace RAJA
 
+}  //  namespace RAJA
 
 
-#endif // KERNEL_NAME_HPP
+#endif  // KERNEL_NAME_HPP
diff --git a/include/RAJA/pattern/params/params_base.hpp b/include/RAJA/pattern/params/params_base.hpp
index 98380f6ffc..7347dc521d 100644
--- a/include/RAJA/pattern/params/params_base.hpp
+++ b/include/RAJA/pattern/params/params_base.hpp
@@ -7,129 +7,259 @@ namespace RAJA
 namespace expt
 {
 
-  template<typename T, typename IndexType = RAJA::Index_type>
-  struct ValLoc {
-    using index_type = IndexType;
-    using value_type = T;
-
-    ValLoc() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValLoc(value_type v) : val(v) {}
-    RAJA_HOST_DEVICE constexpr ValLoc(value_type v, index_type l) : val(v), loc(l) {}
-
-    ValLoc(ValLoc const &) = default;
-    ValLoc(ValLoc &&) = default;
-    ValLoc& operator=(ValLoc const &) = default;
-    ValLoc& operator=(ValLoc &&) = default;
-
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValLoc& rhs) const { return val < rhs.val; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValLoc& rhs) const { return val > rhs.val; }
-
-    RAJA_HOST_DEVICE constexpr const value_type& getVal() const {return val;}
-    RAJA_HOST_DEVICE constexpr const index_type& getLoc() const {return loc;}
-
-    RAJA_HOST_DEVICE void set(T inval, IndexType inindex) {val = inval; loc = inindex;}
-    RAJA_HOST_DEVICE void setVal(T inval) {val = inval;}
-    RAJA_HOST_DEVICE void setLoc(IndexType inindex) {loc = inindex;}
-
-    value_type val;
-    index_type loc = -1;
-  };
-
-  template<typename T, template <typename, typename, typename> class Op>
-  struct ValOp {
-    using value_type = T;
-    using op_type = Op<T,T,T>;
-
-    ValOp() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
-
-    ValOp(ValOp const &) = default;
-    ValOp(ValOp &&) = default;
-    ValOp& operator=(ValOp const &) = default;
-    ValOp& operator=(ValOp &&) = default;
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & min(value_type v) { if (v < val) { val = v; } return *this; }
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & max(value_type v) { if (v > val) { val = v; } return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::plus<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator+=(const value_type& rhs) { val += rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_and<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator&=(const value_type& rhs) { val &= rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_or<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator|=(const value_type& rhs) { val |= rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_and<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE ValOp & operator&=(value_type& rhs) { val &= rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_or<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE ValOp & operator|=(value_type& rhs) { val |= rhs; return *this; }
-
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const { val < rhs.val; return *this; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const { val > rhs.val; return *this; }
-
-    value_type val = op_type::identity();
-  };
-
-  template<typename T, typename IndexType, template <typename, typename, typename> class Op>
-  struct ValOp <ValLoc<T,IndexType>, Op> {
-    using index_type = IndexType;
-    using value_type = ValLoc<T,index_type>;
-    using op_type = Op<value_type,value_type,value_type>;
-    using valloc_value_type = typename value_type::value_type;
-    using valloc_index_type = typename value_type::index_type;
-
-    ValOp() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
-    RAJA_HOST_DEVICE constexpr ValOp(valloc_value_type v, valloc_index_type l) : val(v, l) {}
-
-    ValOp(ValOp const &) = default;
-    ValOp(ValOp &&) = default;
-    ValOp& operator=(ValOp const &) = default;
-    ValOp& operator=(ValOp &&) = default;
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & min(value_type v) { if (v < val) { val = v; } return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & max(value_type v) { if (v > val) { val = v; } return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & minloc(valloc_value_type v, valloc_index_type l) { return min(value_type(v,l)); }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & maxloc(valloc_value_type v, valloc_index_type l) { return max(value_type(v,l)); }
-
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const { return val < rhs.val; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const { return val > rhs.val; }
-
-    value_type val = op_type::identity();
-  };
-
-  template<typename T, typename IndexType, template <typename, typename, typename> class Op>
-  using ValLocOp = ValOp<ValLoc<T, IndexType>, Op>;
+template <typename T, typename IndexType = RAJA::Index_type>
+struct ValLoc  // A value (val) stored together with an index (loc).
+{
+  using index_type = IndexType;
+  using value_type = T;
+
+  ValLoc() = default;
+  RAJA_HOST_DEVICE constexpr explicit ValLoc(value_type v) : val(v) {}
+  RAJA_HOST_DEVICE constexpr ValLoc(value_type v, index_type l) : val(v), loc(l)
+  {}
+
+  ValLoc(ValLoc const&)            = default;
+  ValLoc(ValLoc&&)                 = default;
+  ValLoc& operator=(ValLoc const&) = default;
+  ValLoc& operator=(ValLoc&&)      = default;
+
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValLoc& rhs) const
+  {
+    return val < rhs.val;  // loc does not take part in the ordering
+  }
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValLoc& rhs) const
+  {
+    return val > rhs.val;  // loc does not take part in the ordering
+  }
+
+  RAJA_HOST_DEVICE constexpr const value_type& getVal() const { return val; }
+  RAJA_HOST_DEVICE constexpr const index_type& getLoc() const { return loc; }
+
+  RAJA_HOST_DEVICE void set(T inval, IndexType inindex)
+  {
+    val = inval;
+    loc = inindex;
+  }
+  RAJA_HOST_DEVICE void setVal(T inval) { val = inval; }
+  RAJA_HOST_DEVICE void setLoc(IndexType inindex) { loc = inindex; }
+
+  value_type val;  // no default member initializer
+  index_type loc = -1;  // stays -1 unless a constructor or setter assigns it
+};
+
+template <typename T, template <typename, typename, typename> class Op>
+struct ValOp
+{
+  using value_type = T;
+  using op_type    = Op<T, T, T>;
+
+  ValOp() = default;
+  RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
+
+  ValOp(ValOp const&)            = default;
+  ValOp(ValOp&&)                 = default;
+  ValOp& operator=(ValOp const&) = default;
+  ValOp& operator=(ValOp&&)      = default;
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::minimum<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& min(value_type v)
+  {
+    if (v < val)
+    {
+      val = v;
+    }
+    return *this;
+  }
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::maximum<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& max(value_type v)
+  {
+    if (v > val)
+    {
+      val = v;
+    }
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::plus<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator+=(const value_type& rhs)
+  {
+    val += rhs;
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_and<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator&=(const value_type& rhs)
+  {
+    val &= rhs;
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_or<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator|=(const value_type& rhs)
+  {
+    val |= rhs;
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_and<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE ValOp& operator&=(value_type& rhs)
+  {
+    val &= rhs;
+    return *this;
+  }
+
+  template <
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_or<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE ValOp& operator|=(value_type& rhs)
+  {
+    val |= rhs;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const
+  {
+    // Order by the wrapped value, as the ValOp<ValLoc> specialization does.
+    return val < rhs.val;
+  }
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const
+  {
+    // Order by the wrapped value, as the ValOp<ValLoc> specialization does.
+    return val > rhs.val;
+  }
+
+  value_type val = op_type::identity();
+};
+
+template <typename T,
+          typename IndexType,
+          template <typename, typename, typename>
+          class Op>
+struct ValOp<ValLoc<T, IndexType>, Op>
+{
+  using index_type        = IndexType;
+  using value_type        = ValLoc<T, index_type>;
+  using op_type           = Op<value_type, value_type, value_type>;
+  using valloc_value_type = typename value_type::value_type;
+  using valloc_index_type = typename value_type::index_type;
+
+  ValOp() = default;
+  RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
+  RAJA_HOST_DEVICE constexpr ValOp(valloc_value_type v, valloc_index_type l)
+      : val(v, l)
+  {}
+
+  ValOp(ValOp const&)            = default;
+  ValOp(ValOp&&)                 = default;
+  ValOp& operator=(ValOp const&) = default;
+  ValOp& operator=(ValOp&&)      = default;
+
+  template <typename U                   = op_type,
+            std::enable_if_t<std::is_same<
+                U,
+                RAJA::operators::minimum<value_type, value_type, value_type>>::
+                                 value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& min(value_type v)  // minimum op only
+  {
+    if (v < val)  // strict: on equal val the stored ValLoc is kept
+    {
+      val = v;
+    }
+    return *this;
+  }
+
+  template <typename U                   = op_type,
+            std::enable_if_t<std::is_same<
+                U,
+                RAJA::operators::maximum<value_type, value_type, value_type>>::
+                                 value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& max(value_type v)  // maximum op only
+  {
+    if (v > val)  // strict: on equal val the stored ValLoc is kept
+    {
+      val = v;
+    }
+    return *this;
+  }
+
+  template <typename U                   = op_type,
+            std::enable_if_t<std::is_same<
+                U,
+                RAJA::operators::minimum<value_type, value_type, value_type>>::
+                                 value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& minloc(valloc_value_type v,
+                                           valloc_index_type l)
+  {
+    return min(value_type(v, l));  // wrap (v, l) in a ValLoc, defer to min()
+  }
+
+  template <typename U                   = op_type,
+            std::enable_if_t<std::is_same<
+                U,
+                RAJA::operators::maximum<value_type, value_type, value_type>>::
+                                 value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& maxloc(valloc_value_type v,
+                                           valloc_index_type l)
+  {
+    return max(value_type(v, l));  // wrap (v, l) in a ValLoc, defer to max()
+  }
+
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const
+  {
+    return val < rhs.val;  // delegates to ValLoc::operator<
+  }
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const
+  {
+    return val > rhs.val;  // delegates to ValLoc::operator>
+  }
+
+  value_type val = op_type::identity();  // starts at the operator's identity
+};
+
+template <typename T,
+          typename IndexType,
+          template <typename, typename, typename>
+          class Op>
+using ValLocOp = ValOp<ValLoc<T, IndexType>, Op>;
 
 namespace detail
 {
 
-  struct ForallParamBase {
+struct ForallParamBase
+{
 
-    // Some of this can be made virtual in c++20, for now must be defined in each child class
-    // if any arguments to the forall lambda are needed (e.g. KernelName is excluded.)
-    using ARG_TUP_T = camp::tuple<>; 
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); }
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
-  
-  };
+  // Some of this can be made virtual in c++20, for now must be defined in each
+  // child class if any arguments to the forall lambda are needed (e.g.
+  // KernelName is excluded.)
+  using ARG_TUP_T  = camp::tuple<>;
+  using ARG_LIST_T = typename ARG_TUP_T::TList;
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); }
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
 
-} // namespace detail
+}  // namespace detail
 
-} // namespace expt
+}  // namespace expt
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
-#endif //  RAJA_PARAMS_BASE
+#endif  //  RAJA_PARAMS_BASE
diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index 78b6d7714d..ee4ac7c7f7 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -19,20 +19,23 @@ namespace operators
 {
 
 template <typename T, typename IndexType>
-struct limits<RAJA::expt::ValLoc<T, IndexType>> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType> min()
+struct limits<RAJA::expt::ValLoc<T, IndexType>>
+{
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType>
+  min()
   {
     return RAJA::expt::ValLoc<T, IndexType>(RAJA::operators::limits<T>::min());
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType> max()
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType>
+  max()
   {
     return RAJA::expt::ValLoc<T, IndexType>(RAJA::operators::limits<T>::max());
   }
 };
 
-} //  namespace operators
+}  //  namespace operators
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
 namespace RAJA
 {
@@ -43,159 +46,196 @@ namespace detail
 {
 
 #if defined(RAJA_CUDA_ACTIVE)
-  using device_mem_pool_t = RAJA::cuda::device_mempool_type;
+using device_mem_pool_t = RAJA::cuda::device_mempool_type;
 #elif defined(RAJA_HIP_ACTIVE)
-  using device_mem_pool_t = RAJA::hip::device_mempool_type;
+using device_mem_pool_t = RAJA::hip::device_mempool_type;
 #elif defined(RAJA_SYCL_ACTIVE)
-  using device_mem_pool_t = RAJA::sycl::device_mempool_type;
+using device_mem_pool_t = RAJA::sycl::device_mempool_type;
 #endif
 
-  //
-  //
-  // Basic Reducer
-  //
-  //
-
-  // Basic data type Reducer
-  // T must be a basic data type
-  // VOp must be ValOp<T, Op>
-  template <typename Op, typename T, typename VOp>
-  struct Reducer : public ForallParamBase {
-    using op = Op;
-    using value_type = T; // This is a basic data type
-
-    Reducer() = default;
-
-    // Basic data type constructor
-    RAJA_HOST_DEVICE Reducer(value_type *target_in) : m_valop(VOp{}), target(target_in){}
-
-    Reducer(Reducer const &) = default;
-    Reducer(Reducer &&) = default;
-    Reducer& operator=(Reducer const &) = default;
-    Reducer& operator=(Reducer &&) = default;
-
-    // Internal ValOp object that is used within RAJA::forall/launch
-    VOp m_valop = VOp{};
-
-    // Points to the user specified result variable
-    value_type *target = nullptr;
-
-    // combineTarget() performs the final op on the target data and location in resolve()
-    RAJA_HOST_DEVICE void combineTarget(value_type in)
-    {
-      value_type temp = op{}(*target, in);
-      *target = temp;
-    }
-
-    RAJA_HOST_DEVICE
-    value_type &
-    getVal() { return m_valop.val; }
-
-#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE)
-    // Device related attributes.
-    value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
-    unsigned int * device_count = nullptr;
+//
+//
+// Basic Reducer
+//
+//
+
+// Basic data type Reducer
+// T must be a basic data type
+// VOp must be ValOp<T, Op>
+template <typename Op, typename T, typename VOp>
+struct Reducer : public ForallParamBase
+{
+  using op         = Op;
+  using value_type = T;  // This is a basic data type
+
+  Reducer() = default;
+
+  // Basic data type constructor
+  RAJA_HOST_DEVICE Reducer(value_type* target_in)
+      : m_valop(VOp {}), target(target_in)
+  {}
+
+  Reducer(Reducer const&)            = default;
+  Reducer(Reducer&&)                 = default;
+  Reducer& operator=(Reducer const&) = default;
+  Reducer& operator=(Reducer&&)      = default;
+
+  // Internal ValOp object that is used within RAJA::forall/launch
+  VOp m_valop = VOp {};
+
+  // Points to the user specified result variable
+  value_type* target = nullptr;
+
+  // combineTarget() performs the final op on the target data and location in
+  // resolve()
+  RAJA_HOST_DEVICE void combineTarget(value_type in)
+  {
+    value_type temp = op {}(*target, in);
+    *target         = temp;
+  }
+
+  RAJA_HOST_DEVICE
+  value_type& getVal() { return m_valop.val; }
+
+#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) ||                   \
+    defined(RAJA_SYCL_ACTIVE)
+  // Device related attributes.
+  value_type* devicetarget = nullptr;
+  RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
+  unsigned int* device_count = nullptr;
 #endif
 
-    // These are types and parameters extracted from this struct, and given to the forall.
-    using ARG_TUP_T = camp::tuple<VOp*>;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&m_valop); }
-
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value ;
-  };
-
-  // Partial specialization of Reducer for ValLoc
-  // T is a deduced basic data type
-  // I is a deduced index type
-  template <typename T, typename I, template <typename, typename, typename> class Op>
-  struct Reducer<Op<ValLoc<T,I>, ValLoc<T,I>, ValLoc<T,I>>, ValLoc<T,I>, ValOp<ValLoc<T,I>, Op>> : public ForallParamBase {
-    using target_value_type = T;
-    using target_index_type = I;
-    using value_type = ValLoc<T,I>;
-    using op = Op<value_type,value_type,value_type>;
-    using VOp = ValOp<ValLoc<target_value_type,target_index_type>, Op>;
-
-    Reducer() = default;
-
-    // ValLoc constructor
-    // Note that the target_ variables point to the val and loc within the user defined target ValLoc
-    RAJA_HOST_DEVICE Reducer(value_type *target_in) : m_valop(VOp{}), target_value(&target_in->val), target_index(&target_in->loc) {}
-
-    // Dual input constructor for ReduceLoc<>(data, index) case
-    // The target_ variables point to vars defined by the user
-    RAJA_HOST_DEVICE Reducer(target_value_type *data_in, target_index_type *index_in) : m_valop(VOp{}), target_value(data_in), target_index(index_in) {}
-
-    Reducer(Reducer const &) = default;
-    Reducer(Reducer &&) = default;
-    Reducer& operator=(Reducer const &) = default;
-    Reducer& operator=(Reducer &&) = default;
-
-    // The ValLoc within m_valop is initialized with data and location values from either a ValLoc, or dual data and location values, passed into the constructor
-    VOp m_valop = VOp{};
-
-    // Points to either dual value and index defined by the user, or value and index within a ValLoc defined by the user
-    target_value_type *target_value = nullptr;
-    target_index_type *target_index = nullptr;
-
-    // combineTarget() performs the final op on the target data and location in resolve()
-    RAJA_HOST_DEVICE void combineTarget(value_type in)
-    {
-      // Create a different temp ValLoc solely for combining
-      value_type temp(*target_value, *target_index);
-      temp = op{}(temp, in);
-      *target_value = temp.val;
-      *target_index = temp.loc;
-    }
-
-    RAJA_HOST_DEVICE
-    value_type &
-    getVal() { return m_valop.val; }
-
-#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE)
-    // Device related attributes.
-    value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
-    unsigned int * device_count = nullptr;
+  // These are types and parameters extracted from this struct, and given to the
+  // forall.
+  using ARG_TUP_T = camp::tuple<VOp*>;
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup()
+  {
+    return camp::make_tuple(&m_valop);
+  }
+
+  using ARG_LIST_T                        = typename ARG_TUP_T::TList;
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
+
+// Partial specialization of Reducer for ValLoc
+// T is a deduced basic data type
+// I is a deduced index type
+template <typename T,
+          typename I,
+          template <typename, typename, typename>
+          class Op>
+struct Reducer<Op<ValLoc<T, I>, ValLoc<T, I>, ValLoc<T, I>>,
+               ValLoc<T, I>,
+               ValOp<ValLoc<T, I>, Op>> : public ForallParamBase
+{
+  using target_value_type = T;
+  using target_index_type = I;
+  using value_type        = ValLoc<T, I>;
+  using op                = Op<value_type, value_type, value_type>;
+  using VOp = ValOp<ValLoc<target_value_type, target_index_type>, Op>;
+
+  Reducer() = default;
+
+  // ValLoc constructor
+  // Note that the target_ variables point to the val and loc within the user
+  // defined target ValLoc
+  RAJA_HOST_DEVICE Reducer(value_type* target_in)
+      : m_valop(VOp {}),
+        target_value(&target_in->val),
+        target_index(&target_in->loc)
+  {}
+
+  // Dual input constructor for ReduceLoc<>(data, index) case
+  // The target_ variables point to vars defined by the user
+  RAJA_HOST_DEVICE Reducer(target_value_type* data_in,
+                           target_index_type* index_in)
+      : m_valop(VOp {}), target_value(data_in), target_index(index_in)
+  {}
+
+  Reducer(Reducer const&)            = default;
+  Reducer(Reducer&&)                 = default;
+  Reducer& operator=(Reducer const&) = default;
+  Reducer& operator=(Reducer&&)      = default;
+
+  // The ValLoc within m_valop is initialized with data and location values from
+  // either a ValLoc, or dual data and location values, passed into the
+  // constructor
+  VOp m_valop = VOp {};
+
+  // Points to either dual value and index defined by the user, or value and
+  // index within a ValLoc defined by the user
+  target_value_type* target_value = nullptr;
+  target_index_type* target_index = nullptr;
+
+  // combineTarget() performs the final op on the target data and location in
+  // resolve()
+  RAJA_HOST_DEVICE void combineTarget(value_type in)
+  {
+    // Create a different temp ValLoc solely for combining
+    value_type temp(*target_value, *target_index);
+    temp          = op {}(temp, in);
+    *target_value = temp.val;
+    *target_index = temp.loc;
+  }
+
+  RAJA_HOST_DEVICE
+  value_type& getVal() { return m_valop.val; }
+
+#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) ||                   \
+    defined(RAJA_SYCL_ACTIVE)
+  // Device related attributes.
+  value_type* devicetarget = nullptr;
+  RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
+  unsigned int* device_count = nullptr;
 #endif
 
-    // These are types and parameters extracted from this struct, and given to the forall.
-    using ARG_TUP_T = camp::tuple<VOp*>;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&m_valop); }
+  // These are types and parameters extracted from this struct, and given to the
+  // forall.
+  using ARG_TUP_T = camp::tuple<VOp*>;
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup()
+  {
+    return camp::make_tuple(&m_valop);
+  }
 
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value ;
-  };
+  using ARG_LIST_T                        = typename ARG_TUP_T::TList;
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
 
-} // namespace detail
+}  // namespace detail
 
 // Standard use case.
 template <template <typename, typename, typename> class Op, typename T>
-auto constexpr Reduce(T *target)
+auto constexpr Reduce(T* target)
 {
-  return detail::Reducer<Op<T,T,T>, T, ValOp<T, Op>>(target);
+  return detail::Reducer<Op<T, T, T>, T, ValOp<T, Op>>(target);
 }
 
 // User-defined ValLoc case.
-template <template <typename, typename, typename> class Op, typename T, typename IndexType>
-auto constexpr Reduce(ValLoc<T, IndexType> *target)
+template <template <typename, typename, typename> class Op,
+          typename T,
+          typename IndexType>
+auto constexpr Reduce(ValLoc<T, IndexType>* target)
 {
-  using VL = ValLoc<T,IndexType>;
-  return detail::Reducer<Op<VL,VL,VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(target);
+  using VL = ValLoc<T, IndexType>;
+  return detail::Reducer<Op<VL, VL, VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(
+      target);
 }
 
-// Dual input use case where reduction value and location are separate, non-ValLoc types supplied by the user.
-template <template <typename, typename, typename> class Op, typename T, typename IndexType>
-auto constexpr ReduceLoc(T *target, IndexType *index)
+// Dual input use case where reduction value and location are separate,
+// non-ValLoc types supplied by the user.
+template <template <typename, typename, typename> class Op,
+          typename T,
+          typename IndexType>
+auto constexpr ReduceLoc(T* target, IndexType* index)
 {
-  using VL = ValLoc<T,IndexType>;
-  return detail::Reducer<Op<VL,VL,VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(target, index);
+  using VL = ValLoc<T, IndexType>;
+  return detail::Reducer<Op<VL, VL, VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(
+      target, index);
 }
 
-} // namespace expt
+}  // namespace expt
 
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_HPP
+#endif  //  NEW_REDUCE_HPP
diff --git a/include/RAJA/pattern/reduce.hpp b/include/RAJA/pattern/reduce.hpp
index 0c0eaf3efb..a1cc15dceb 100644
--- a/include/RAJA/pattern/reduce.hpp
+++ b/include/RAJA/pattern/reduce.hpp
@@ -205,7 +205,7 @@ class ReduceSum;
  */
 template <typename REDUCE_POLICY_T, typename T>
 class ReduceBitOr;
- 
+
 
 /*!
  ******************************************************************************
@@ -231,7 +231,7 @@ class ReduceBitOr;
  */
 template <typename REDUCE_POLICY_T, typename T>
 class ReduceBitAnd;
-} //namespace RAJA
+}  // namespace RAJA
 
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/pattern/scan.hpp b/include/RAJA/pattern/scan.hpp
index 0f46ee0a22..baf4664062 100644
--- a/include/RAJA/pattern/scan.hpp
+++ b/include/RAJA/pattern/scan.hpp
@@ -46,20 +46,21 @@ inline namespace policy_by_value_interface
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>>
+template <
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>>
 RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<Container>>
-inclusive_scan_inplace(ExecPolicy&& p,
-                       Res r,
-                       Container&& c,
-                       Function binop = Function{})
+    concepts::enable_if_t<resources::EventProxy<Res>,
+                          type_traits::is_execution_policy<ExecPolicy>,
+                          type_traits::is_resource<Res>,
+                          std::is_constructible<camp::resources::Resource, Res>,
+                          type_traits::is_range<Container>>
+    inclusive_scan_inplace(ExecPolicy&& p,
+                           Res r,
+                           Container&& c,
+                           Function binop = Function {})
 {
   using std::begin;
   using std::end;
@@ -68,32 +69,32 @@ inclusive_scan_inplace(ExecPolicy&& p,
                 "Function must model BinaryFunction");
   static_assert(type_traits::is_random_access_range<Container>::value,
                 "Container must model RandomAccessRange");
-  if (begin(c) == end(c)) {
+  if (begin(c) == end(c))
+  {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::inclusive_inplace(r, std::forward<ExecPolicy>(p),
-                                       begin(c), end(c), binop);
+  return impl::scan::inclusive_inplace(r, std::forward<ExecPolicy>(p), begin(c),
+                                       end(c), binop);
 }
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
+template <
+    typename ExecPolicy,
+    typename Container,
+    typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>,
+    typename Res      = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
 inclusive_scan_inplace(ExecPolicy&& p,
                        Container&& c,
-                       Function binop = Function{})
+                       Function binop = Function {})
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::inclusive_scan_inplace(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      binop);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), binop);
 }
 
 /*!
@@ -111,19 +112,19 @@ inclusive_scan_inplace(ExecPolicy&& p,
 template <typename ExecPolicy,
           typename Res,
           typename Container,
-          typename T = RAJA::detail::ContainerVal<Container>,
+          typename T        = RAJA::detail::ContainerVal<Container>,
           typename Function = operators::plus<T>>
 RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<Container>>
-exclusive_scan_inplace(ExecPolicy&& p,
-                       Res r,
-                       Container&& c,
-                       Function binop = Function{},
-                       T value = Function::identity())
+    concepts::enable_if_t<resources::EventProxy<Res>,
+                          type_traits::is_execution_policy<ExecPolicy>,
+                          type_traits::is_resource<Res>,
+                          std::is_constructible<camp::resources::Resource, Res>,
+                          type_traits::is_range<Container>>
+    exclusive_scan_inplace(ExecPolicy&& p,
+                           Res r,
+                           Container&& c,
+                           Function binop = Function {},
+                           T value        = Function::identity())
 {
   using std::begin;
   using std::end;
@@ -132,35 +133,33 @@ exclusive_scan_inplace(ExecPolicy&& p,
                 "Function must model BinaryFunction");
   static_assert(type_traits::is_random_access_range<Container>::value,
                 "Container must model RandomAccessRange");
-  if (begin(c) == end(c)) {
+  if (begin(c) == end(c))
+  {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::exclusive_inplace(r, std::forward<ExecPolicy>(p),
-                                       begin(c), end(c), binop, value);
+  return impl::scan::exclusive_inplace(r, std::forward<ExecPolicy>(p), begin(c),
+                                       end(c), binop, value);
 }
 ///
 template <typename ExecPolicy,
           typename Container,
-          typename T = RAJA::detail::ContainerVal<Container>,
+          typename T        = RAJA::detail::ContainerVal<Container>,
           typename Function = operators::plus<T>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
 exclusive_scan_inplace(ExecPolicy&& p,
                        Container&& c,
-                       Function binop = Function{},
-                       T value = Function::identity())
+                       Function binop = Function {},
+                       T value        = Function::identity())
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::exclusive_scan_inplace(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      binop,
-      value);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), binop, value);
 }
 
 /*!
@@ -183,19 +182,20 @@ template <typename ExecPolicy,
           typename Res,
           typename InContainer,
           typename OutContainer,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<InContainer>>>
+          typename Function =
+              operators::plus<RAJA::detail::ContainerVal<InContainer>>>
 RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<InContainer>,
-                      type_traits::is_range<OutContainer>>
-inclusive_scan(ExecPolicy&& p,
-               Res r,
-               InContainer&& in,
-               OutContainer&& out,
-               Function binop = Function{})
+    concepts::enable_if_t<resources::EventProxy<Res>,
+                          type_traits::is_execution_policy<ExecPolicy>,
+                          type_traits::is_resource<Res>,
+                          std::is_constructible<camp::resources::Resource, Res>,
+                          type_traits::is_range<InContainer>,
+                          type_traits::is_range<OutContainer>>
+    inclusive_scan(ExecPolicy&& p,
+                   Res r,
+                   InContainer&& in,
+                   OutContainer&& out,
+                   Function binop = Function {})
 {
   using std::begin;
   using std::end;
@@ -207,36 +207,36 @@ inclusive_scan(ExecPolicy&& p,
                 "InContainer must model RandomAccessRange");
   static_assert(type_traits::is_random_access_range<OutContainer>::value,
                 "OutContainer must model RandomAccessRange");
-  if (begin(in) == end(in)) {
+  if (begin(in) == end(in))
+  {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::inclusive(r, std::forward<ExecPolicy>(p),
-                               begin(in), end(in), begin(out), binop);
+  return impl::scan::inclusive(r, std::forward<ExecPolicy>(p), begin(in),
+                               end(in), begin(out), binop);
 }
 ///
 template <typename ExecPolicy,
           typename InContainer,
           typename OutContainer,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<InContainer>>,
+          typename Function =
+              operators::plus<RAJA::detail::ContainerVal<InContainer>>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<InContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, InContainer>>,
-                      type_traits::is_range<OutContainer>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<InContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, InContainer>>,
+    type_traits::is_range<OutContainer>>
 inclusive_scan(ExecPolicy&& p,
                InContainer&& in,
                OutContainer&& out,
-               Function binop = Function{})
+               Function binop = Function {})
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::inclusive_scan(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<InContainer>(in),
-      std::forward<OutContainer>(out),
-      binop);
+      std::forward<ExecPolicy>(p), r, std::forward<InContainer>(in),
+      std::forward<OutContainer>(out), binop);
 }
 
 /*!
@@ -259,21 +259,21 @@ template <typename ExecPolicy,
           typename Res,
           typename InContainer,
           typename OutContainer,
-          typename T = RAJA::detail::ContainerVal<InContainer>,
+          typename T        = RAJA::detail::ContainerVal<InContainer>,
           typename Function = operators::plus<T>>
 RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<InContainer>,
-                      type_traits::is_range<OutContainer>>
-exclusive_scan(ExecPolicy&& p,
-               Res r,
-               InContainer&& in,
-               OutContainer&& out,
-               Function binop = Function{},
-               T value = Function::identity())
+    concepts::enable_if_t<resources::EventProxy<Res>,
+                          type_traits::is_execution_policy<ExecPolicy>,
+                          type_traits::is_resource<Res>,
+                          std::is_constructible<camp::resources::Resource, Res>,
+                          type_traits::is_range<InContainer>,
+                          type_traits::is_range<OutContainer>>
+    exclusive_scan(ExecPolicy&& p,
+                   Res r,
+                   InContainer&& in,
+                   OutContainer&& out,
+                   Function binop = Function {},
+                   T value        = Function::identity())
 {
   using std::begin;
   using std::end;
@@ -285,42 +285,40 @@ exclusive_scan(ExecPolicy&& p,
                 "InContainer must model RandomAccessRange");
   static_assert(type_traits::is_random_access_range<OutContainer>::value,
                 "OutContainer must model RandomAccessRange");
-  if (begin(in) == end(in)) {
+  if (begin(in) == end(in))
+  {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::exclusive(r, std::forward<ExecPolicy>(p),
-                               begin(in), end(in), begin(out), binop, value);
+  return impl::scan::exclusive(r, std::forward<ExecPolicy>(p), begin(in),
+                               end(in), begin(out), binop, value);
 }
 ///
 template <typename ExecPolicy,
           typename InContainer,
           typename OutContainer,
-          typename T = RAJA::detail::ContainerVal<InContainer>,
+          typename T        = RAJA::detail::ContainerVal<InContainer>,
           typename Function = operators::plus<T>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<InContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, InContainer>>,
-                      type_traits::is_range<OutContainer>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<InContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, InContainer>>,
+    type_traits::is_range<OutContainer>>
 exclusive_scan(ExecPolicy&& p,
                InContainer&& in,
                OutContainer&& out,
-               Function binop = Function{},
-               T value = Function::identity())
+               Function binop = Function {},
+               T value        = Function::identity())
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::exclusive_scan(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<InContainer>(in),
-      std::forward<OutContainer>(out),
-      binop,
-      value);
+      std::forward<ExecPolicy>(p), r, std::forward<InContainer>(in),
+      std::forward<OutContainer>(out), binop, value);
 }
 
-}  // end inline namespace policy_by_value_interface
+}  // namespace policy_by_value_interface
 
 
 /*!
@@ -329,11 +327,11 @@ exclusive_scan(ExecPolicy&& p,
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type >
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+template <typename ExecPolicy,
+          typename... Args,
+          typename Res = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 exclusive_scan(Args&&... args)
 {
   Res r = Res::get_default();
@@ -342,10 +340,9 @@ exclusive_scan(Args&&... args)
 }
 ///
 template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 exclusive_scan(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::exclusive_scan(
@@ -358,11 +355,11 @@ exclusive_scan(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 inclusive_scan(Args&&... args)
 {
   Res r = Res::get_default();
@@ -371,10 +368,9 @@ inclusive_scan(Args&&... args)
 }
 ///
 template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 inclusive_scan(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::inclusive_scan(
@@ -387,11 +383,11 @@ inclusive_scan(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 exclusive_scan_inplace(Args&&... args)
 {
   Res r = Res::get_default();
@@ -400,10 +396,9 @@ exclusive_scan_inplace(Args&&... args)
 }
 ///
 template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 exclusive_scan_inplace(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::exclusive_scan_inplace(
@@ -416,11 +411,11 @@ exclusive_scan_inplace(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 inclusive_scan_inplace(Args&&... args)
 {
   Res r = Res::get_default();
@@ -429,10 +424,9 @@ inclusive_scan_inplace(Args&&... args)
 }
 ///
 template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 inclusive_scan_inplace(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::inclusive_scan_inplace(
diff --git a/include/RAJA/pattern/sort.hpp b/include/RAJA/pattern/sort.hpp
index acf3fe5ba7..fdbc5722ee 100644
--- a/include/RAJA/pattern/sort.hpp
+++ b/include/RAJA/pattern/sort.hpp
@@ -46,23 +46,21 @@ inline namespace policy_by_value_interface
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
+template <
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
                       std::is_constructible<camp::resources::Resource, Res>,
                       type_traits::is_range<Container>>
-sort(ExecPolicy&& p,
-     Res r,
-     Container&& c,
-     Compare comp = Compare{})
+sort(ExecPolicy&& p, Res r, Container&& c, Compare comp = Compare {})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<Container>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -71,34 +69,35 @@ sort(ExecPolicy&& p,
 
   auto begin_it = begin(c);
   auto end_it   = end(c);
-  auto N = distance(begin_it, end_it);
+  auto N        = distance(begin_it, end_it);
 
-  if (N > 1) {
-    return impl::sort::unstable(r, std::forward<ExecPolicy>(p),
-                                begin_it, end_it, comp);
-  } else {
+  if (N > 1)
+  {
+    return impl::sort::unstable(r, std::forward<ExecPolicy>(p), begin_it,
+                                end_it, comp);
+  }
+  else
+  {
     return resources::EventProxy<Res>(r);
   }
 }
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
-sort(ExecPolicy&& p,
-     Container&& c,
-     Compare comp = Compare{})
+template <
+    typename ExecPolicy,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
+    typename Res     = typename resources::get_resource<ExecPolicy>::type>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
+sort(ExecPolicy&& p, Container&& c, Compare comp = Compare {})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), comp);
 }
 
 /*!
@@ -113,23 +112,21 @@ sort(ExecPolicy&& p,
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
+template <
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
                       std::is_constructible<camp::resources::Resource, Res>,
                       type_traits::is_range<Container>>
-stable_sort(ExecPolicy&& p,
-            Res r,
-            Container&& c,
-            Compare comp = Compare{})
+stable_sort(ExecPolicy&& p, Res r, Container&& c, Compare comp = Compare {})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<Container>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -138,34 +135,35 @@ stable_sort(ExecPolicy&& p,
 
   auto begin_it = begin(c);
   auto end_it   = end(c);
-  auto N = distance(begin_it, end_it);
+  auto N        = distance(begin_it, end_it);
 
-  if (N > 1) {
-    return impl::sort::stable(r, std::forward<ExecPolicy>(p),
-                              begin_it, end_it, comp);
-  } else {
+  if (N > 1)
+  {
+    return impl::sort::stable(r, std::forward<ExecPolicy>(p), begin_it, end_it,
+                              comp);
+  }
+  else
+  {
     return resources::EventProxy<Res>(r);
   }
 }
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
-stable_sort(ExecPolicy&& p,
-            Container&& c,
-            Compare comp = Compare{})
+template <
+    typename ExecPolicy,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
+    typename Res     = typename resources::get_resource<ExecPolicy>::type>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
+stable_sort(ExecPolicy&& p, Container&& c, Compare comp = Compare {})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), comp);
 }
 
 /*!
@@ -185,7 +183,8 @@ template <typename ExecPolicy,
           typename Res,
           typename KeyContainer,
           typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
+          typename Compare =
+              operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
@@ -196,11 +195,11 @@ sort_pairs(ExecPolicy&& p,
            Res r,
            KeyContainer&& keys,
            ValContainer&& vals,
-           Compare comp = Compare{})
+           Compare comp = Compare {})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<KeyContainer>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -211,12 +210,15 @@ sort_pairs(ExecPolicy&& p,
 
   auto begin_key = begin(keys);
   auto end_key   = end(keys);
-  auto N = distance(begin_key, end_key);
+  auto N         = distance(begin_key, end_key);
 
-  if (N > 1) {
-    return impl::sort::unstable_pairs(r, std::forward<ExecPolicy>(p),
-                                      begin_key, end_key, begin(vals), comp);
-  } else {
+  if (N > 1)
+  {
+    return impl::sort::unstable_pairs(r, std::forward<ExecPolicy>(p), begin_key,
+                                      end_key, begin(vals), comp);
+  }
+  else
+  {
     return resources::EventProxy<Res>(r);
   }
 }
@@ -224,25 +226,25 @@ sort_pairs(ExecPolicy&& p,
 template <typename ExecPolicy,
           typename KeyContainer,
           typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
+          typename Compare =
+              operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<KeyContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, KeyContainer>>,
-                      type_traits::is_range<ValContainer>>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<KeyContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, KeyContainer>>,
+    type_traits::is_range<ValContainer>>
 sort_pairs(ExecPolicy&& p,
            KeyContainer&& keys,
            ValContainer&& vals,
-           Compare comp = Compare{})
+           Compare comp = Compare {})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort_pairs(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<KeyContainer>(keys),
-      std::forward<ValContainer>(vals),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<KeyContainer>(keys),
+      std::forward<ValContainer>(vals), comp);
 }
 
 /*!
@@ -262,7 +264,8 @@ template <typename ExecPolicy,
           typename Res,
           typename KeyContainer,
           typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
+          typename Compare =
+              operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
@@ -273,11 +276,11 @@ stable_sort_pairs(ExecPolicy&& p,
                   Res r,
                   KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<KeyContainer>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -288,12 +291,15 @@ stable_sort_pairs(ExecPolicy&& p,
 
   auto begin_key = begin(keys);
   auto end_key   = end(keys);
-  auto N = distance(begin_key, end_key);
+  auto N         = distance(begin_key, end_key);
 
-  if (N > 1) {
-    return impl::sort::stable_pairs(r, std::forward<ExecPolicy>(p),
-                                    begin_key, end_key, begin(vals), comp);
-  } else {
+  if (N > 1)
+  {
+    return impl::sort::stable_pairs(r, std::forward<ExecPolicy>(p), begin_key,
+                                    end_key, begin(vals), comp);
+  }
+  else
+  {
     return resources::EventProxy<Res>(r);
   }
 }
@@ -301,28 +307,28 @@ stable_sort_pairs(ExecPolicy&& p,
 template <typename ExecPolicy,
           typename KeyContainer,
           typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
+          typename Compare =
+              operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<KeyContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, KeyContainer>>,
-                      type_traits::is_range<ValContainer>>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<KeyContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, KeyContainer>>,
+    type_traits::is_range<ValContainer>>
 stable_sort_pairs(ExecPolicy&& p,
                   KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort_pairs(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<KeyContainer>(keys),
-      std::forward<ValContainer>(vals),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<KeyContainer>(keys),
+      std::forward<ValContainer>(vals), comp);
 }
 
-}  // end inline namespace policy_by_value_interface
+}  // namespace policy_by_value_interface
 
 // =============================================================================
 
@@ -332,11 +338,12 @@ stable_sort_pairs(ExecPolicy&& p,
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-sort(Args &&... args)
+sort(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort<ExecPolicy>(
@@ -347,10 +354,10 @@ template <typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-sort(Res r, Args &&... args)
+sort(Res r, Args&&... args)
 {
-  return ::RAJA::policy_by_value_interface::sort(
-      ExecPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::sort(ExecPolicy(), r,
+                                                 std::forward<Args>(args)...);
 }
 
 /*!
@@ -359,11 +366,12 @@ sort(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-stable_sort(Args &&... args)
+stable_sort(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort<ExecPolicy>(
@@ -374,7 +382,7 @@ template <typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-stable_sort(Res r, Args &&... args)
+stable_sort(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::stable_sort(
       ExecPolicy(), r, std::forward<Args>(args)...);
@@ -386,11 +394,12 @@ stable_sort(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-sort_pairs(Args &&... args)
+sort_pairs(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort_pairs<ExecPolicy>(
@@ -401,7 +410,7 @@ template <typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-sort_pairs(Res r, Args &&... args)
+sort_pairs(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::sort_pairs(
       ExecPolicy(), r, std::forward<Args>(args)...);
@@ -413,11 +422,12 @@ sort_pairs(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
+template <typename ExecPolicy,
+          typename... Args,
           typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-stable_sort_pairs(Args &&... args)
+stable_sort_pairs(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort_pairs<ExecPolicy>(
@@ -428,7 +438,7 @@ template <typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-stable_sort_pairs(Res r, Args &&... args)
+stable_sort_pairs(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::stable_sort_pairs(
       ExecPolicy(), r, std::forward<Args>(args)...);
diff --git a/include/RAJA/pattern/synchronize.hpp b/include/RAJA/pattern/synchronize.hpp
index d3e42af81c..77c88e5c6d 100644
--- a/include/RAJA/pattern/synchronize.hpp
+++ b/include/RAJA/pattern/synchronize.hpp
@@ -41,7 +41,7 @@ namespace RAJA
 template <typename Policy>
 void synchronize()
 {
-  synchronize_impl(Policy{});
+  synchronize_impl(Policy {});
 }
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/MatrixRegister.hpp b/include/RAJA/pattern/tensor/MatrixRegister.hpp
index 9fa39f34ee..ab6d2f7c42 100644
--- a/include/RAJA/pattern/tensor/MatrixRegister.hpp
+++ b/include/RAJA/pattern/tensor/MatrixRegister.hpp
@@ -28,25 +28,27 @@ namespace RAJA
 {
 namespace expt
 {
-  template<typename T, typename LAYOUT, typename REGISTER_POLICY = default_register>
-  using SquareMatrixRegister =
-      TensorRegister<REGISTER_POLICY,
-                     T,
-                     LAYOUT,
-                     camp::idx_seq<RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem,
-                                   RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem>>;
-
-  template<typename T, typename LAYOUT, camp::idx_t ROWS, camp::idx_t COLS,
-           typename REGISTER_POLICY = default_register>
-  using RectMatrixRegister =
-      TensorRegister<REGISTER_POLICY,
-                     T,
-                     LAYOUT,
-                     camp::idx_seq<ROWS,COLS>>;
-
-} // namespace expt
+template <typename T,
+          typename LAYOUT,
+          typename REGISTER_POLICY = default_register>
+using SquareMatrixRegister = TensorRegister<
+    REGISTER_POLICY,
+    T,
+    LAYOUT,
+    camp::idx_seq<
+        RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem,
+        RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem>>;
+
+template <typename T,
+          typename LAYOUT,
+          camp::idx_t ROWS,
+          camp::idx_t COLS,
+          typename REGISTER_POLICY = default_register>
+using RectMatrixRegister =
+    TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<ROWS, COLS>>;
+
+}  // namespace expt
 }  // namespace RAJA
 
 
-
 #endif
diff --git a/include/RAJA/pattern/tensor/ScalarRegister.hpp b/include/RAJA/pattern/tensor/ScalarRegister.hpp
index f6675b4ba9..d532d58ade 100644
--- a/include/RAJA/pattern/tensor/ScalarRegister.hpp
+++ b/include/RAJA/pattern/tensor/ScalarRegister.hpp
@@ -28,16 +28,14 @@ namespace RAJA
 namespace expt
 {
 
-  // Convenience to describe ScalarTensors
-  template<typename T>
-  using ScalarRegister = TensorRegister<scalar_register,
-                                        T,
-                                        ScalarLayout,
-                                        camp::idx_seq<>>;
+// Convenience to describe ScalarTensors
+template <typename T>
+using ScalarRegister =
+    TensorRegister<scalar_register, T, ScalarLayout, camp::idx_seq<>>;
 
 
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/TensorBlock.hpp b/include/RAJA/pattern/tensor/TensorBlock.hpp
index 0e9869a772..6fc9d48897 100644
--- a/include/RAJA/pattern/tensor/TensorBlock.hpp
+++ b/include/RAJA/pattern/tensor/TensorBlock.hpp
@@ -360,7 +360,6 @@ namespace ET{
 }  // namespace RAJA
 
 
-
 #endif
 
 #endif
diff --git a/include/RAJA/pattern/tensor/TensorIndex.hpp b/include/RAJA/pattern/tensor/TensorIndex.hpp
index 8f152d92ce..c384465a15 100644
--- a/include/RAJA/pattern/tensor/TensorIndex.hpp
+++ b/include/RAJA/pattern/tensor/TensorIndex.hpp
@@ -29,196 +29,190 @@ namespace expt
 {
 
 
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, strip_index_type_t<IDX> INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-  struct StaticTensorIndexInner;
-
-  template<typename INNER_TYPE>
-  struct StaticTensorIndex;
-
-
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
-  class TensorIndex {
-    public:
-      using self_type = TensorIndex<IDX, TENSOR_TYPE, DIM>;
-      using value_type = strip_index_type_t<IDX>;
-      using index_type = IDX;
-      using tensor_type = TENSOR_TYPE;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      self_type all(){
-        return self_type(index_type(-1), value_type(-1));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,value_type(-1),value_type(-1)>> static_all(){
-        return StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,value_type(-1),value_type(-1)>>();
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      self_type range(index_type begin, index_type end){
-        return self_type(begin, value_type(stripIndexType(end-begin)));
-      }
-
-      template<value_type TBEGIN, value_type TEND>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,TBEGIN,TEND-TBEGIN>> static_range(){
-        return StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,TBEGIN,TEND-TBEGIN>>();
-      }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex() : m_index(index_type(0)), m_length(0) {}
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(RAJA::TypedRangeSegment<IDX> const &seg) :
-      m_index(*seg.begin()), m_length(seg.size())
-      {}
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(index_type value, value_type length) : m_index(value), m_length(length) {}
-
-      template<typename T, camp::idx_t D>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(TensorIndex<IDX, T, D> const &c) : m_index(*c), m_length(c.size()) {}
-
-
-      template<value_type IDX_VAL, value_type LEN_VAL>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(StaticTensorIndex<StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, IDX_VAL, LEN_VAL>> const RAJA_UNUSED_ARG(&c))
-          : m_index(IDX_VAL)
-          , m_length(LEN_VAL)
-      {}
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      index_type const &operator*() const {
-        return m_index;
-      }
-
-      // used in strip_by_value as a static cast
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      explicit operator index_type() const {
-        // return does not matter, but suppresses no-return warnings
-        return m_index;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      index_type begin() const {
-        return m_index;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      value_type size() const {
-        return m_length;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      value_type dim() const {
-        return DIM;
-      }
-
-    private:
-      index_type m_index;
-      value_type m_length;
-  };
-
-
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, strip_index_type_t<IDX> INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-  struct StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,INDEX_VALUE,LENGTH_VALUE>> {
-
-      using base_type  = TensorIndex<IDX,TENSOR_TYPE,DIM>;
-      using value_type = strip_index_type_t<IDX>;
-      using index_type = IDX;
-      using tensor_type = TENSOR_TYPE;
-
-      static const index_type s_index  = INDEX_VALUE;
-      static const index_type s_length = LENGTH_VALUE;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr operator base_type() {
-        return base_type(s_index,s_length);
-      }
-    
-  };
-
-
-
-  /*!
-   * Index that specifies the starting element index of a Vector
-   */
-  template<typename IDX, typename VECTOR_TYPE>
-  using VectorIndex =  TensorIndex<IDX, VECTOR_TYPE, 0>;
-
-  /*!
-   * Index that specifies the starting Row index of a matrix
-   */
-  template<typename IDX, typename MATRIX_TYPE>
-  using RowIndex =  TensorIndex<IDX, MATRIX_TYPE, 0>;
-
-  /*!
-   * Index that specifies the starting Column index of a matrix
-   */
-  template<typename IDX, typename MATRIX_TYPE>
-  using ColIndex =  TensorIndex<IDX, MATRIX_TYPE, 1>;
-
-
-  /*!
-   * Converts a Row index to a Column index
-   */
-  template<typename IDX, typename MATRIX_TYPE>
+template <typename IDX,
+          typename TENSOR_TYPE,
+          camp::idx_t DIM,
+          strip_index_type_t<IDX> INDEX_VALUE,
+          strip_index_type_t<IDX> LENGTH_VALUE>
+struct StaticTensorIndexInner;
+
+template <typename INNER_TYPE>
+struct StaticTensorIndex;
+
+
+template <typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
+class TensorIndex
+{
+public:
+  using self_type   = TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using value_type  = strip_index_type_t<IDX>;
+  using index_type  = IDX;
+  using tensor_type = TENSOR_TYPE;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr self_type all()
+  {
+    return self_type(index_type(-1), value_type(-1));
+  }
+
+  RAJA_INLINE
   RAJA_HOST_DEVICE
+  static constexpr StaticTensorIndex<StaticTensorIndexInner<IDX,
+                                                            TENSOR_TYPE,
+                                                            DIM,
+                                                            value_type(-1),
+                                                            value_type(-1)>>
+  static_all()
+  {
+    return StaticTensorIndex<StaticTensorIndexInner<
+        IDX, TENSOR_TYPE, DIM, value_type(-1), value_type(-1)>>();
+  }
+
   RAJA_INLINE
-  constexpr
-  ColIndex<IDX, MATRIX_TYPE> toColIndex(RowIndex<IDX, MATRIX_TYPE> const &r){
-    return ColIndex<IDX, MATRIX_TYPE>(*r, r.size());
+  RAJA_HOST_DEVICE
+  static constexpr self_type range(index_type begin, index_type end)
+  {
+    return self_type(begin, value_type(stripIndexType(end - begin)));
   }
 
-  /*!
-   * Converts a Column index to a Row index
-   */
-  template<typename IDX, typename MATRIX_TYPE>
+  template <value_type TBEGIN, value_type TEND>
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr StaticTensorIndex<
+      StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, TBEGIN, TEND - TBEGIN>>
+  static_range()
+  {
+    return StaticTensorIndex<
+        StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, TBEGIN, TEND - TBEGIN>>();
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr TensorIndex() : m_index(index_type(0)), m_length(0) {}
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr TensorIndex(RAJA::TypedRangeSegment<IDX> const& seg)
+      : m_index(*seg.begin()), m_length(seg.size())
+  {}
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr TensorIndex(index_type value, value_type length)
+      : m_index(value), m_length(length)
+  {}
+
+  template <typename T, camp::idx_t D>
+  RAJA_INLINE
+      RAJA_HOST_DEVICE constexpr TensorIndex(TensorIndex<IDX, T, D> const& c)
+      : m_index(*c), m_length(c.size())
+  {}
+
+
+  template <value_type IDX_VAL, value_type LEN_VAL>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorIndex(
+      StaticTensorIndex<
+          StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, IDX_VAL, LEN_VAL>> const
+          RAJA_UNUSED_ARG(&c))
+      : m_index(IDX_VAL), m_length(LEN_VAL)
+  {}
+
+
+  RAJA_INLINE
   RAJA_HOST_DEVICE
+  constexpr index_type const& operator*() const { return m_index; }
+
+  // used in strip_by_value as a static cast
   RAJA_INLINE
-  constexpr
-  RowIndex<IDX, MATRIX_TYPE> toRowIndex(ColIndex<IDX, MATRIX_TYPE> const &c){
-    return RowIndex<IDX, MATRIX_TYPE>(*c, c.size());
+  RAJA_HOST_DEVICE
+  constexpr explicit operator index_type() const
+  {
+    // return does not matter, but suppresses no-return warnings
+    return m_index;
   }
 
-} // namespace expt
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type begin() const { return m_index; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr value_type size() const { return m_length; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr value_type dim() const { return DIM; }
+
+private:
+  index_type m_index;
+  value_type m_length;
+};
+
+
+template <typename IDX,
+          typename TENSOR_TYPE,
+          camp::idx_t DIM,
+          strip_index_type_t<IDX> INDEX_VALUE,
+          strip_index_type_t<IDX> LENGTH_VALUE>
+struct StaticTensorIndex<
+    StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>>
+{
+
+  using base_type   = TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using value_type  = strip_index_type_t<IDX>;
+  using index_type  = IDX;
+  using tensor_type = TENSOR_TYPE;
+
+  static const index_type s_index  = INDEX_VALUE;
+  static const index_type s_length = LENGTH_VALUE;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr operator base_type() { return base_type(s_index, s_length); }
+};
+
+
+/*!
+ * Index that specifies the starting element index of a Vector
+ */
+template <typename IDX, typename VECTOR_TYPE>
+using VectorIndex = TensorIndex<IDX, VECTOR_TYPE, 0>;
+
+/*!
+ * Index that specifies the starting Row index of a matrix
+ */
+template <typename IDX, typename MATRIX_TYPE>
+using RowIndex = TensorIndex<IDX, MATRIX_TYPE, 0>;
+
+/*!
+ * Index that specifies the starting Column index of a matrix
+ */
+template <typename IDX, typename MATRIX_TYPE>
+using ColIndex = TensorIndex<IDX, MATRIX_TYPE, 1>;
+
+
+/*!
+ * Converts a Row index to a Column index
+ */
+template <typename IDX, typename MATRIX_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr ColIndex<IDX, MATRIX_TYPE>
+toColIndex(RowIndex<IDX, MATRIX_TYPE> const& r)
+{
+  return ColIndex<IDX, MATRIX_TYPE>(*r, r.size());
+}
+
+/*!
+ * Converts a Column index to a Row index
+ */
+template <typename IDX, typename MATRIX_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr RowIndex<IDX, MATRIX_TYPE>
+toRowIndex(ColIndex<IDX, MATRIX_TYPE> const& c)
+{
+  return RowIndex<IDX, MATRIX_TYPE>(*c, c.size());
+}
+
+}  // namespace expt
 }  // namespace RAJA
 
 #include "RAJA/pattern/tensor/internal/TensorIndexTraits.hpp"
diff --git a/include/RAJA/pattern/tensor/TensorLayout.hpp b/include/RAJA/pattern/tensor/TensorLayout.hpp
index 376d6b905a..8e2404c3a2 100644
--- a/include/RAJA/pattern/tensor/TensorLayout.hpp
+++ b/include/RAJA/pattern/tensor/TensorLayout.hpp
@@ -28,67 +28,56 @@ namespace expt
 {
 
 
-  template<camp::idx_t ... DIM_SEQ>
-  struct TensorLayout : public camp::idx_seq<DIM_SEQ...>
-  {
+template <camp::idx_t... DIM_SEQ>
+struct TensorLayout : public camp::idx_seq<DIM_SEQ...>
+{
 
-      using seq_t = camp::idx_seq<DIM_SEQ...>;
+  using seq_t = camp::idx_seq<DIM_SEQ...>;
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_column_major(){
-        return false;
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool is_column_major() { return false; }
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_row_major(){
-        return false;
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool is_row_major() { return false; }
+};
 
-  };
 
+// specialization for Matrix layouts, where column vs row major matters
+template <camp::idx_t S2, camp::idx_t S1>
+struct TensorLayout<S2, S1> : public camp::idx_seq<S2, S1>
+{
+  using seq_t = camp::idx_seq<S2, S1>;
 
-  // specialization for Matrix layouts, where column vs row major matters
-  template<camp::idx_t S2, camp::idx_t S1>
-  struct TensorLayout<S2, S1> : public camp::idx_seq<S2, S1>
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool is_column_major()
   {
-      using seq_t = camp::idx_seq<S2, S1>;
+    return S1 == 0;  // Rows are stride-1
+  }
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_column_major(){
-        return S1 == 0; // Rows are stride-1
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_row_major(){
-        return S1 == 1; // Columns are stride-1
-      }
-  };
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool is_row_major()
+  {
+    return S1 == 1;  // Columns are stride-1
+  }
+};
 
 
-  // 0d tensor (scalar) layout
-  using ScalarLayout = TensorLayout<>;
+// 0d tensor (scalar) layout
+using ScalarLayout = TensorLayout<>;
 
-  // 1d tensor (vector) layout
-  using VectorLayout = TensorLayout<0>;
+// 1d tensor (vector) layout
+using VectorLayout = TensorLayout<0>;
 
-  // 2d tensor (matrix) layouts
-  using RowMajorLayout = TensorLayout<0, 1>;
-  using ColMajorLayout = TensorLayout<1, 0>;
+// 2d tensor (matrix) layouts
+using RowMajorLayout = TensorLayout<0, 1>;
+using ColMajorLayout = TensorLayout<1, 0>;
 
 
-} // namespace expt
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/TensorRegister.hpp b/include/RAJA/pattern/tensor/TensorRegister.hpp
index d410f46fb7..22f4c16cae 100644
--- a/include/RAJA/pattern/tensor/TensorRegister.hpp
+++ b/include/RAJA/pattern/tensor/TensorRegister.hpp
@@ -28,80 +28,91 @@
 
 namespace RAJA
 {
-namespace internal {
-namespace expt {
-    class TensorRegisterConcreteBase;
-}
+namespace internal
+{
+namespace expt
+{
+class TensorRegisterConcreteBase;
 }
+}  // namespace internal
 
 namespace expt
 {
 
 
-  template<typename REGISTER_POLICY,
-           typename T,
-           typename LAYOUT,
-           typename SIZES>
-  class TensorRegister;
+template <typename REGISTER_POLICY, typename T, typename LAYOUT, typename SIZES>
+class TensorRegister;
 
 
-  /*
-   * Overload for:    arithmetic + TensorRegister
+/*
+ * Overload for:    arithmetic + TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator+(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).add(rhs);
-  }
-
-  /*
-   * Overload for:    arithmetic - TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator+(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).add(rhs);
+}
+
+/*
+ * Overload for:    arithmetic - TensorRegister
+
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator-(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).subtract(rhs);
-  }
-
-  /*
-   * Overload for:    arithmetic * TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator-(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).subtract(rhs);
+}
+
+/*
+ * Overload for:    arithmetic * TensorRegister
+
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator*(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return rhs.scale(lhs);
-  }
-
-  /*
-   * Overload for:    arithmetic / TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator*(LEFT const& lhs, RIGHT const& rhs)
+{
+  return rhs.scale(lhs);
+}
+
+/*
+ * Overload for:    arithmetic / TensorRegister
+
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator/(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).divide(rhs);
-  }
-
-} // namespace expt
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator/(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).divide(rhs);
+}
+
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/VectorRegister.hpp b/include/RAJA/pattern/tensor/VectorRegister.hpp
index afab05658f..8041622d11 100644
--- a/include/RAJA/pattern/tensor/VectorRegister.hpp
+++ b/include/RAJA/pattern/tensor/VectorRegister.hpp
@@ -24,16 +24,15 @@ namespace RAJA
 {
 namespace expt
 {
-  // Convenience to describe VectorTensors
-  template<typename T, typename REGISTER_POLICY = default_register, camp::idx_t NUM_ELEM = Register<T,REGISTER_POLICY>::s_num_elem>
-  using VectorRegister = TensorRegister<REGISTER_POLICY,
-                                        T,
-                                        VectorLayout,
-                                        camp::idx_seq<NUM_ELEM> >;
-} // namespace expt
-
-} // namespace RAJA
-
+// Convenience to describe VectorTensors
+template <typename T,
+          typename REGISTER_POLICY = default_register,
+          camp::idx_t NUM_ELEM     = Register<T, REGISTER_POLICY>::s_num_elem>
+using VectorRegister =
+    TensorRegister<REGISTER_POLICY, T, VectorLayout, camp::idx_seq<NUM_ELEM>>;
+}  // namespace expt
+
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp b/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
index 953f4fd4a0..09099eef27 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
@@ -34,110 +34,121 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template <typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
+class TensorBinaryOperator
+    : public TensorExpressionBase<
+          TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>>
+{
+public:
+  using self_type = TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>;
+  using operator_type      = OPERATOR;
+  using left_operand_type  = LEFT_OPERAND;
+  using right_operand_type = RIGHT_OPERAND;
+
+  using element_type = typename LEFT_OPERAND::element_type;
+  using index_type   = typename LEFT_OPERAND::index_type;
+
+  using operator_traits = OperatorTraits<LEFT_OPERAND, RIGHT_OPERAND>;
+  using result_type     = typename operator_traits::result_type;
+
+  static constexpr camp::idx_t s_num_dims = operator_traits::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorBinaryOperator(left_operand_type const& left,
+                       right_operand_type const& right)
+      : m_left_operand {left}, m_right_operand {right}
+  {}
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr auto getDimSize(camp::idx_t dim) const
+      -> decltype(operator_traits::getDimSize(dim,
+                                              m_left_operand,
+                                              m_right_operand))
+  {
+    return operator_traits::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const& tile) const
+      -> decltype(operator_type::eval(m_left_operand.eval(tile),
+                                      m_right_operand.eval(tile)))
+  {
+    return operator_type::eval(m_left_operand.eval(tile),
+                               m_right_operand.eval(tile));
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
   {
+    operator_type::print_ast();
+    printf("[");
+    operator_type::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
 
 
-    template<typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
-    class TensorBinaryOperator :
-        public TensorExpressionBase<TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>>
-    {
-      public:
-        using self_type = TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>;
-        using operator_type = OPERATOR;
-        using left_operand_type = LEFT_OPERAND;
-        using right_operand_type = RIGHT_OPERAND;
-
-        using element_type = typename LEFT_OPERAND::element_type;
-        using index_type = typename LEFT_OPERAND::index_type;
-
-        using operator_traits = OperatorTraits<LEFT_OPERAND, RIGHT_OPERAND>;
-        using result_type = typename operator_traits::result_type;
-
-        static constexpr camp::idx_t s_num_dims =
-            operator_traits::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorBinaryOperator(left_operand_type const &left, right_operand_type const &right) :
-        m_left_operand{left}, m_right_operand{right}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        auto getDimSize(camp::idx_t dim) const ->
-        decltype(operator_traits::getDimSize(dim, m_left_operand, m_right_operand))
-        {
-          return operator_traits::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(operator_type::eval(m_left_operand.eval(tile), m_right_operand.eval(tile)))
-        {
-          return operator_type::eval(m_left_operand.eval(tile), m_right_operand.eval(tile));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          operator_type::print_ast();
-          printf("[");
-          operator_type::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-
-
-    /*
-     * Overload for:    arithmetic + tensorexpression
-
-     */
-    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator+(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-    TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
-    {
-      return TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
-    }
-
-
-    /*
-     * Overload for:    arithmetic - tensorexpression
-
-     */
-    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator-(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-    TensorSubtract<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
-    {
-      return TensorSubtract<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
-    }
+/*
+ * Overload for:    arithmetic + tensorexpression
+
+ */
+template <typename LEFT_OPERAND,
+          typename RIGHT_OPERAND,
+          typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+                                  bool>::type = true,
+          typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+                                                  RIGHT_OPERAND>::value,
+                                  bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator+(LEFT_OPERAND const& left,
+                                            RIGHT_OPERAND const& right)
+    -> TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+                 RIGHT_OPERAND>
+{
+  return TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+                   RIGHT_OPERAND>(
+      NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
+}
+
+
+/*
+ * Overload for:    arithmetic - tensorexpression
+
+ */
+template <typename LEFT_OPERAND,
+          typename RIGHT_OPERAND,
+          typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+                                  bool>::type = true,
+          typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+                                                  RIGHT_OPERAND>::value,
+                                  bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator-(LEFT_OPERAND const& left,
+                                            RIGHT_OPERAND const& right)
+    -> TensorSubtract<
+        typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+        RIGHT_OPERAND>
+{
+  return TensorSubtract<
+      typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+      RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left),
+                     right);
+}
 
 
 //    /*
@@ -145,21 +156,27 @@ namespace expt
 //
 //     */
 //    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-//      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-//      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
+//      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+//      bool>::type = true, typename
+//      std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+//      RIGHT_OPERAND>::value, bool>::type = true>
 //    RAJA_INLINE
 //    RAJA_HOST_DEVICE
 //    auto operator/(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-//    TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
+//    TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+//    RIGHT_OPERAND>
 //    {
-//      return TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
+//      return TensorDivide<typename
+//      NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+//      RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left),
+//      right);
 //    }
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp b/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
index a1450bf19f..52fbf83cfa 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
@@ -27,159 +27,133 @@ namespace expt
 {
 
 
-  namespace ET
-  {
-
-    struct TensorOperatorAdd
-    {
-
-      template<typename LEFT, typename RIGHT>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      auto eval(LEFT const &left, RIGHT const &right) ->
-        decltype(left + right)
-      {
-        return left + right;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast(){
-        printf("Add");
-      }
-    };
-
-    struct TensorOperatorSubtract
-    {
-
-      template<typename LEFT, typename RIGHT>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      auto eval(LEFT const &left, RIGHT const &right) ->
-        decltype(left - right)
-      {
-        return left - right;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast(){
-        printf("Subtract");
-      }
-    };
-
-
-
+namespace ET
+{
 
+struct TensorOperatorAdd
+{
 
-    template<typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
-    class TensorBinaryOperator;
+  template <typename LEFT, typename RIGHT>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto eval(LEFT const& left,
+                                                RIGHT const& right)
+      -> decltype(left + right)
+  {
+    return left + right;
+  }
 
-    template<typename LHS, typename RHS>
-    using TensorAdd = TensorBinaryOperator<TensorOperatorAdd, LHS, RHS>;
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Add"); }
+};
 
-    template<typename LHS, typename RHS>
-    using TensorSubtract = TensorBinaryOperator<TensorOperatorSubtract, LHS, RHS>;
+struct TensorOperatorSubtract
+{
 
+  template <typename LEFT, typename RIGHT>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto eval(LEFT const& left,
+                                                RIGHT const& right)
+      -> decltype(left - right)
+  {
+    return left - right;
+  }
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Subtract"); }
+};
 
 
-    /*!
-     * Provides default operations for add, subtract and divide
-     *
-     * For the most part, this is just element wise operations between
-     * compatible tensors.
-     *
-     * There are specializations that handle when one operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE, class ENABLE = void>
-    struct OperatorTraits {
+template <typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
+class TensorBinaryOperator;
 
-        using result_type = typename LHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
+template <typename LHS, typename RHS>
+using TensorAdd = TensorBinaryOperator<TensorOperatorAdd, LHS, RHS>;
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Elemental");
-        }
+template <typename LHS, typename RHS>
+using TensorSubtract = TensorBinaryOperator<TensorOperatorSubtract, LHS, RHS>;
 
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &lhs, RHS_TYPE const &rhs) {
-          return dim == 0 ? lhs.getDimSize(0) : rhs.getDimSize(1);
-        }
+/*!
+ * Provides default operations for add, subtract and divide
+ *
+ * For the most part, this is just element wise operations between
+ * compatible tensors.
+ *
+ * There are specializations that handle when one operand is a scalar
+ */
+template <typename LHS_TYPE, typename RHS_TYPE, class ENABLE = void>
+struct OperatorTraits
+{
 
-    };
+  using result_type                       = typename LHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
 
-    /*!
-     * Specialization when the left operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    struct OperatorTraits<LHS_TYPE, RHS_TYPE,
-    typename std::enable_if<LHS_TYPE::s_num_dims == 0>::type>
-    {
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Elemental"); }
 
-        using result_type = typename RHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = RHS_TYPE::s_num_dims;
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scalar");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &, RHS_TYPE const &rhs) {
-          return rhs.getDimSize(dim);
-        }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const& lhs, RHS_TYPE const& rhs)
+  {
+    return dim == 0 ? lhs.getDimSize(0) : rhs.getDimSize(1);
+  }
+};
 
-    };
+/*!
+ * Specialization when the left operand is a scalar
+ */
+template <typename LHS_TYPE, typename RHS_TYPE>
+struct OperatorTraits<LHS_TYPE,
+                      RHS_TYPE,
+                      typename std::enable_if<LHS_TYPE::s_num_dims == 0>::type>
+{
 
-    /*!
-     * Specialization when the right operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    struct OperatorTraits<LHS_TYPE, RHS_TYPE,
-    typename std::enable_if<RHS_TYPE::s_num_dims == 0>::type>
-    {
+  using result_type                       = typename RHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RHS_TYPE::s_num_dims;
 
-        using result_type = typename LHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scalar"); }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scalar");
-        }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const&, RHS_TYPE const& rhs)
+  {
+    return rhs.getDimSize(dim);
+  }
+};
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &lhs, RHS_TYPE const &) {
-          return lhs.getDimSize(dim);
-        }
+/*!
+ * Specialization when the right operand is a scalar
+ */
+template <typename LHS_TYPE, typename RHS_TYPE>
+struct OperatorTraits<LHS_TYPE,
+                      RHS_TYPE,
+                      typename std::enable_if<RHS_TYPE::s_num_dims == 0>::type>
+{
 
+  using result_type                       = typename LHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scalar"); }
 
-    };
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const& lhs, RHS_TYPE const&)
+  {
+    return lhs.getDimSize(dim);
+  }
+};
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
index 210414eaec..c61cfd0891 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
@@ -26,7 +26,6 @@
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
 
 
-
 namespace RAJA
 {
 namespace internal
@@ -35,93 +34,90 @@ namespace expt
 {
 
 
-  namespace ET
-  {
-
-
-    /*!
-     * Temporary n-dimensional memory.
-     *
-     * STORAGE_TYPE defines the memory storage
-     * TENSOR_TYPE defines what kind of tensor is returned by eval()
-     */
-    template<typename STORAGE_TYPE, typename TENSOR_TYPE>
-    class BlockLiteral :  public TensorExpressionBase<BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>> {
-      public:
-        using self_type = BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>;
-        using storage_type = STORAGE_TYPE;
-        using tensor_type = TENSOR_TYPE;
-        using result_type = TENSOR_TYPE;
-        using ref_type = typename STORAGE_TYPE::ref_type;
-        using tile_type = typename ref_type::tile_type;
-        using index_type = camp::idx_t;
-
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
-
-
-      private:
-        storage_type m_storage;
-        tile_type m_tile_origin;
-
-      public:
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return storage_type::s_dim_elem(dim);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        BlockLiteral(tile_type tile_origin) :
-          m_storage(),
-          m_tile_origin(tile_origin)
-        {
+namespace ET
+{
 
-        }
 
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const {
-          result_type result;
+/*!
+ * Temporary n-dimensional memory.
+ *
+ * STORAGE_TYPE defines the memory storage
+ * TENSOR_TYPE defines what kind of tensor is returned by eval()
+ */
+template <typename STORAGE_TYPE, typename TENSOR_TYPE>
+class BlockLiteral
+    : public TensorExpressionBase<BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>>
+{
+public:
+  using self_type    = BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>;
+  using storage_type = STORAGE_TYPE;
+  using tensor_type  = TENSOR_TYPE;
+  using result_type  = TENSOR_TYPE;
+  using ref_type     = typename STORAGE_TYPE::ref_type;
+  using tile_type    = typename ref_type::tile_type;
+  using index_type   = camp::idx_t;
+
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
+
+
+private:
+  storage_type m_storage;
+  tile_type m_tile_origin;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return storage_type::s_dim_elem(dim);
+  }
 
-          // load result from storage
-          result.load_ref(merge_ref_tile(m_storage.get_ref(), tile - m_tile_origin));
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr BlockLiteral(tile_type tile_origin)
+      : m_storage(), m_tile_origin(tile_origin)
+  {}
 
-          return result;
-        }
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const& tile) const
+  {
+    result_type result;
 
+    // load result from storage
+    result.load_ref(merge_ref_tile(m_storage.get_ref(), tile - m_tile_origin));
 
-        /*!
-         *  Returns a ref that points at this data, shifted by its origin
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        ref_type get_ref() {
+    return result;
+  }
 
-          // compute shifited origin ref
-          return shift_tile_origin(m_storage.get_ref(), m_tile_origin);
 
-        }
+  /*!
+   *  Returns a ref that points at this data, shifted by its origin
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  ref_type get_ref()
+  {
 
+    // compute shifited origin ref
+    return shift_tile_origin(m_storage.get_ref(), m_tile_origin);
+  }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("BlockLiteral()");
-        }
 
-    };
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("BlockLiteral()"); }
+};
 
 
 //    /*
-//     * For TensorRegister nodes, we need to wrap this in a constant value ET node
+//     * For TensorRegister nodes, we need to wrap this in a constant value ET
+//     node
 //     */
 //    template<typename RHS>
 //    struct NormalizeOperandHelper<RHS,
-//    typename std::enable_if<std::is_base_of<RAJA::internal::TensorRegisterConcreteBase, RHS>::value>::type>
+//    typename
+//    std::enable_if<std::is_base_of<RAJA::internal::TensorRegisterConcreteBase,
+//    RHS>::value>::type>
 //    {
 //        using return_type = BlockLiteral<RHS>;
 //
@@ -134,10 +130,10 @@ namespace expt
 //        }
 //    };
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp b/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
index 3e96a63462..50ae0933c0 100644
--- a/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
@@ -28,7 +28,7 @@
 #include "RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp"
 
 
-//#define RAJA_DEBUG_PRINT_ET_AST
+// #define RAJA_DEBUG_PRINT_ET_AST
 
 namespace RAJA
 {
@@ -38,128 +38,121 @@ namespace expt
 {
 
 
-    class TensorRegisterConcreteBase;
+class TensorRegisterConcreteBase;
 
-  namespace ET
+namespace ET
+{
+
+//
+// forward decls
+//
+
+template <typename TENSOR_REGISTER_TYPE, typename REF_TYPE>
+class TensorLoadStore;
+
+
+template <typename LHS_TYPE, typename RHS_TYPE>
+class TensorMultiply;
+
+template <typename LHS_TYPE, typename RHS_TYPE>
+class TensorDivide;
+
+template <typename TENSOR_TYPE>
+class TensorNegate;
+
+template <typename TENSOR_TYPE>
+class TensorTranspose;
+
+
+// provides a non-templated base-type for all ET's
+// this allows using things like std::is_base_of
+class TensorExpressionConcreteBase
+{};
+
+
+template <typename DERIVED_TYPE>
+class TensorExpressionBase : public TensorExpressionConcreteBase
+{
+public:
+  using self_type = DERIVED_TYPE;
+
+private:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type* getThis() { return static_cast<self_type*>(this); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr self_type const* getThis() const
+  {
+    return static_cast<self_type const*>(this);
+  }
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr camp::idx_t getDimBegin(camp::idx_t) const { return 0; }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorAdd<self_type, normalize_operand_t<RHS>>
+  operator+(RHS const& rhs) const
+  {
+    return TensorAdd<self_type, normalize_operand_t<RHS>>(
+        *getThis(), normalizeOperand(rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_INLINE
+      RAJA_HOST_DEVICE TensorSubtract<self_type, normalize_operand_t<RHS>>
+      operator-(RHS const& rhs) const
   {
+    return TensorSubtract<self_type, normalize_operand_t<RHS>>(
+        *getThis(), normalizeOperand(rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorNegate<self_type> operator-() const
+  {
+    return TensorNegate<self_type>(*getThis());
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_INLINE
+      RAJA_HOST_DEVICE TensorMultiply<self_type, normalize_operand_t<RHS>>
+      operator*(RHS const& rhs) const
+  {
+    return TensorMultiply<self_type, normalize_operand_t<RHS>>(
+        *getThis(), normalizeOperand(rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorDivide<self_type, normalize_operand_t<RHS>>
+  operator/(RHS const& rhs) const
+  {
+    return TensorDivide<self_type, normalize_operand_t<RHS>>(
+        *getThis(), normalizeOperand(rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorTranspose<self_type> transpose() const
+  {
+    return TensorTranspose<self_type>(*getThis());
+  }
+};
+
+
+}  // namespace ET
 
-    //
-    // forward decls
-    //
-
-    template<typename TENSOR_REGISTER_TYPE, typename REF_TYPE>
-    class TensorLoadStore;
-
-
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    class TensorMultiply;
-
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    class TensorDivide;
-
-    template<typename TENSOR_TYPE>
-    class TensorNegate;
-
-    template<typename TENSOR_TYPE>
-    class TensorTranspose;
-
-
-
-
-    // provides a non-templated base-type for all ET's
-    // this allows using things like std::is_base_of
-    class TensorExpressionConcreteBase{};
-
-
-    template<typename DERIVED_TYPE>
-    class TensorExpressionBase :public TensorExpressionConcreteBase {
-      public:
-        using self_type = DERIVED_TYPE;
-
-      private:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        self_type *getThis(){
-          return static_cast<self_type*>(this);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        self_type const *getThis() const {
-          return static_cast<self_type const*>(this);
-        }
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        camp::idx_t getDimBegin(camp::idx_t ) const
-        {
-          return 0;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorAdd<self_type, normalize_operand_t<RHS> >
-        operator+(RHS const &rhs) const {
-          return TensorAdd<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorSubtract<self_type, normalize_operand_t<RHS>>
-        operator-(RHS const &rhs) const {
-          return TensorSubtract<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorNegate<self_type>
-        operator-() const {
-          return TensorNegate<self_type>(*getThis());
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiply<self_type, normalize_operand_t<RHS>>
-        operator*(RHS const &rhs) const {
-          return TensorMultiply<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorDivide<self_type, normalize_operand_t<RHS>>
-        operator/(RHS const &rhs) const {
-          return TensorDivide<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorTranspose<self_type>
-        transpose() const {
-          return TensorTranspose<self_type>(*getThis());
-        }
-
-    };
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp b/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
index e7e7223ce4..6ea5d09aa9 100644
--- a/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
@@ -20,1210 +20,1232 @@
 #define RAJA_pattern_tensor_ET_MultiplyOperator_HPP
 
 
-
 namespace RAJA
 {
 namespace internal
 {
 namespace expt
 {
-  //forward
-  class TensorBlockConcreteBase;
+// forward
+class TensorBlockConcreteBase;
+
+
+namespace ET
+{
 
 
+/*!
+ * Provides default multiply, multiply add, and multiply subtract
+ * operations.
+ *
+ * If the operands are both matrices, we perform a matrix-matrix multiply.
+ * Otherwise, we perform element-wise operations.
+ */
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          class ENABLE = void>
+struct MultiplyOperator
+{
 
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
 
-  namespace ET
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast()
   {
+    printf("Elemental(%d,%d)", (int)s_num_dims,
+           (int)RIGHT_OPERAND_TYPE::s_num_dims);
+  }
 
 
-    /*!
-     * Provides default multiply, multiply add, and multiply subtract
-     * operations.
-     *
-     * If the operands are both matrices, we perform a matrix-matrix multiply.
-     * Otherwise, we perform element-wise operations.
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, class ENABLE = void>
-    struct MultiplyOperator
-    {
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise multiply
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply(TILE_TYPE const& tile,
+           LEFT_OPERAND_TYPE const& left,
+           RIGHT_OPERAND_TYPE const& right)
+      -> decltype(left.eval(tile) * right.eval(tile))
+  {
+    return left.eval(tile) * right.eval(tile);
+  }
+
+
+  /*!
+   * Evaluate operands and perform element-wise multiply add
+   */
+  template <typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply_add(TILE_TYPE const& tile,
+               LEFT_OPERAND_TYPE const& left,
+               RIGHT_OPERAND_TYPE const& right,
+               ADD_OPERAND_TYPE const& add)
+      -> decltype(left.eval(tile).multiply_add(right.eval(tile),
+                                               add.eval(tile)))
+  {
+    return left.eval(tile).multiply_add(right.eval(tile), add.eval(tile));
+  }
+
+
+  /*!
+   * Evaluate operands and perform element-wise multiply subtract
+   */
+  template <typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply_subtract(TILE_TYPE const& tile,
+                    LEFT_OPERAND_TYPE const& left,
+                    RIGHT_OPERAND_TYPE const& right,
+                    SUBTRACT_OPERAND_TYPE const& subtract)
+      -> decltype(left.eval(tile).multiply_subtract(right.eval(tile),
+                                                    subtract.eval(tile)))
+  {
+    return left.eval(tile).multiply_subtract(right.eval(tile),
+                                             subtract.eval(tile));
+  }
+};
 
-        using result_type = typename LEFT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Elemental(%d,%d)", (int)s_num_dims, (int)RIGHT_OPERAND_TYPE::s_num_dims);
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-          return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-        }
-
-        /*!
-         * Evaluate operands and perform element-wise multiply
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(left.eval(tile) * right.eval(tile))
-        {
-          return left.eval(tile) * right.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(left.eval(tile).multiply_add(right.eval(tile), add.eval(tile)))
-        {
-          return left.eval(tile).multiply_add(right.eval(tile), add.eval(tile));
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(left.eval(tile).multiply_subtract(right.eval(tile), subtract.eval(tile)))
-        {
-          return left.eval(tile).multiply_subtract(right.eval(tile), subtract.eval(tile));
-        }
-
-
-    };
-
-
-    /*!
-     * Specialization that provides multiplying a scalar * tensor
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
+
+/*!
+ * Specialization that provides multiplying a scalar * tensor
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
     typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0>::type>
-    {
+{
 
-        using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scale");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-          return right.getDimSize(dim);
-        }
-
-        /*!
-         * Evaluate operands and perform scaling operation
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(right.eval(tile).scale(left.eval(tile)))
-        {
-          return right.eval(tile).scale(left.eval(tile));
-        }
-
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(right.eval(tile).scale(left.eval(tile)) + add.eval(tile))
-        {
-          return right.eval(tile).scale(left.eval(tile)) + add.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile))
-        {
-          return right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile);
-        }
-    };
-
-
-    /*!
-     * Specialization that provides multiplying a tensor*scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scale"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const&, RIGHT_OPERAND_TYPE const& right)
+  {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply(TILE_TYPE const& tile,
+           LEFT_OPERAND_TYPE const& left,
+           RIGHT_OPERAND_TYPE const& right)
+      -> decltype(right.eval(tile).scale(left.eval(tile)))
+  {
+    return right.eval(tile).scale(left.eval(tile));
+  }
+
+
+  /*!
+   * Evaluate operands and perform element-wise multiply add
+   */
+  template <typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply_add(TILE_TYPE const& tile,
+               LEFT_OPERAND_TYPE const& left,
+               RIGHT_OPERAND_TYPE const& right,
+               ADD_OPERAND_TYPE const& add)
+      -> decltype(right.eval(tile).scale(left.eval(tile)) + add.eval(tile))
+  {
+    return right.eval(tile).scale(left.eval(tile)) + add.eval(tile);
+  }
+
+
+  /*!
+   * Evaluate operands and perform element-wise multiply subtract
+   */
+  template <typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply_subtract(TILE_TYPE const& tile,
+                    LEFT_OPERAND_TYPE const& left,
+                    RIGHT_OPERAND_TYPE const& right,
+                    SUBTRACT_OPERAND_TYPE const& subtract)
+      -> decltype(right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile))
+  {
+    return right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile);
+  }
+};
+
+
+/*!
+ * Specialization that provides multiplying a tensor*scalar
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
     typename std::enable_if<RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
-    {
+{
 
-        using result_type = typename LEFT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scale");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-          return left.getDimSize(dim);
-        }
-
-        /*!
-         * Evaluate operands and perform scaling operation
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(left.eval(tile).scale(right.eval(tile)))
-        {
-          return left.eval(tile).scale(right.eval(tile));
-        }
-
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(left.eval(tile).scale(right.eval(tile)) + add.eval(tile))
-        {
-          return left.eval(tile).scale(right.eval(tile)) + add.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile))
-        {
-          return left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile);
-        }
-    };
-
-
-    /*!
-     * Specialization for matrix-vector right multiplication.
-     *
-     * By default the A*x operator for two matrices produces a matrix-vector
-     * multiplication.
-     *
-     * The right hand side vector is always treated as a column vector.
-     *
-     * The resulting vector type is inherited from the RHS
-     *
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==1>::type>
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scale"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const& left, RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply(TILE_TYPE const& tile,
+           LEFT_OPERAND_TYPE const& left,
+           RIGHT_OPERAND_TYPE const& right)
+      -> decltype(left.eval(tile).scale(right.eval(tile)))
+  {
+    return left.eval(tile).scale(right.eval(tile));
+  }
+
+
+  /*!
+   * Evaluate operands, scale left by right, then add the third operand
+   */
+  template <typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply_add(TILE_TYPE const& tile,
+               LEFT_OPERAND_TYPE const& left,
+               RIGHT_OPERAND_TYPE const& right,
+               ADD_OPERAND_TYPE const& add)
+      -> decltype(left.eval(tile).scale(right.eval(tile)) + add.eval(tile))
+  {
+    return left.eval(tile).scale(right.eval(tile)) + add.eval(tile);
+  }
+
+
+  /*!
+   * Evaluate operands, scale left by right, then subtract the third operand
+   */
+  template <typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto
+  multiply_subtract(TILE_TYPE const& tile,
+                    LEFT_OPERAND_TYPE const& left,
+                    RIGHT_OPERAND_TYPE const& right,
+                    SUBTRACT_OPERAND_TYPE const& subtract)
+      -> decltype(left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile))
+  {
+    return left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile);
+  }
+};
+
+
+/*!
+ * Specialization for matrix-vector right multiplication.
+ *
+ * By default the A*x operator for a matrix A and a vector x produces a
+ * matrix-vector multiplication.
+ *
+ * The right hand side vector is always treated as a column vector.
+ *
+ * The resulting vector type is the LHS result type's column_vector_type
+ *
+ *
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+{
+
+  using left_type  = LEFT_OPERAND_TYPE;
+  using right_type = RIGHT_OPERAND_TYPE;
+  using result_type =
+      typename LEFT_OPERAND_TYPE::result_type::column_vector_type;
+  static constexpr camp::idx_t s_num_dims = 1;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Vector"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const&, RIGHT_OPERAND_TYPE const& right)
+  {
+    return dim == 0 ? right.getDimSize(0) : 0;
+  }
+
+  /*!
+   * Evaluate operands and perform matrix-vector multiply
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  multiply(TILE_TYPE const& tile,
+           LEFT_OPERAND_TYPE const& left,
+           RIGHT_OPERAND_TYPE const& right)
+  {
+
+    // clear result
+    result_type result(0);
+
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+  template <typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  multiply_add(TILE_TYPE const& tile,
+               LEFT_OPERAND_TYPE const& left,
+               RIGHT_OPERAND_TYPE const& right,
+               ADD_TYPE const& add)
+  {
+
+    // evaluate add into result
+    result_type result = add.eval(tile);
+
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+private:
+  template <typename STORAGE, typename TILE_TYPE, typename INDEX = void>
+  struct MultiplyBridge;
+
+  template <typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void
+  multiply_into_result(STORAGE& result,
+                       TILE_TYPE const& tile,
+                       LEFT_OPERAND_TYPE const& et_left,
+                       RIGHT_OPERAND_TYPE const& et_right)
+  {
+    // using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
+
+    // get tile size from matrix type
+    auto tile_size = left_type::result_type::s_dim_elem(1);
+    auto k_size    = et_left.getDimSize(1);
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
+
+    // tile over row of left and column of right
+    auto left_tile =
+        LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+    left_tile.m_begin[0] = tile.m_begin[0];
+    left_tile.m_size[0]  = tile.m_size[0];
+    left_tile.m_size[1]  = tile_size;
+
+    using RightType = typename TILE_TYPE::nonstatic_self_type;
+
+    RightType right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size)
     {
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename LEFT_OPERAND_TYPE::result_type::column_vector_type;
-      static constexpr camp::idx_t s_num_dims = 1;
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k;
+      auto left            = et_left.eval(left_tile);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Matrx*Vector");
-      }
+      right_tile.m_begin[0] = k;
+      auto right            = et_right.eval(right_tile);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return dim == 0 ? right.getDimSize(0) : 0;
-      }
+      // accumulate product
+      result = left.right_multiply_vector_accumulate(right, result);
+    }
+    // remainder tile in k
+    if (k < k_size)
+    {
+      auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k;
+      left_part_tile.m_size[1]  = k_size - k;
+      auto left                 = et_left.eval(left_part_tile);
+
+      auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k;
+      right_part_tile.m_size[0]  = k_size - k;
+      auto right                 = et_right.eval(right_part_tile);
+
+      // accumulate product of partial tile
+      result = left.right_multiply_vector_accumulate(right, result);
+    }
+  }
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right){
 
-        // clear result
-        result_type result(0);
+  template <typename T>
+  struct Diag
+  {
+    static_assert(!std::is_same<T, void>::value, "diag");
+  };
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+  template <typename I, TensorTileSize TTS, typename B, typename S>
+  struct Diag<StaticTensorTile<I, TTS, B, S>>
+  {
+    static_assert(std::is_same<I, void>::value, "diag");
+  };
 
-        return result;
-      }
+  template <typename STORAGE, typename TILE_TYPE, typename INDEX>
+  struct MultiplyBridge
+  {
+
+    Diag<TILE_TYPE> diag;
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add){
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE& result,
+                                     TILE_TYPE const& tile,
+                                     LEFT_OPERAND_TYPE const& et_left,
+                                     RIGHT_OPERAND_TYPE const& et_right)
+    {
+      // using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
+
+      // get tile size from matrix type
+      auto tile_size = left_type::result_type::s_dim_elem(1);
+      auto k_size    = et_left.getDimSize(1);
+      // TODO: check that left and right are compatible
+      // m_left.getDimSize(1) == m_right.getDimSize(0)
+      // how do we provide checking for this kind of error?
+
+      // tile over row of left and column of right
+      auto left_tile =
+          LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+      left_tile.m_begin[0] = tile.m_begin[0];
+      left_tile.m_size[0]  = tile.m_size[0];
+      left_tile.m_size[1]  = tile_size;
+
+      using RightType = typename TILE_TYPE::nonstatic_self_type;
+
+      RightType right_tile = tile;
+      right_tile.m_size[0] = tile_size;
+
+      // Do full tiles in k
+      decltype(k_size) k = 0;
+      for (; k + tile_size <= k_size; k += tile_size)
+      {
 
-        // evaluate add into result
-        result_type result = add.eval(tile);
+        // evaluate both sides of operator
+        left_tile.m_begin[1] = k;
+        auto left            = et_left.eval(left_tile);
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+        right_tile.m_begin[0] = k;
+        auto right            = et_right.eval(right_tile);
 
-        return result;
+        // accumulate product
+        result = left.right_multiply_vector_accumulate(right, result);
       }
+      // remainder tile in k
+      if (k < k_size)
+      {
+        auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+        left_part_tile.m_begin[1] = k;
+        left_part_tile.m_size[1]  = k_size - k;
+        auto left                 = et_left.eval(left_part_tile);
+
+        auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+        right_part_tile.m_begin[0] = k;
+        right_part_tile.m_size[0]  = k_size - k;
+        auto right                 = et_right.eval(right_part_tile);
+
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
+      }
+    }
+  };
+
+
+  template <size_t INDEX,
+            typename STORAGE,
+            typename INDEX_TYPE,
+            TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE Begin0,
+            INDEX_TYPE... BeginTail,
+            INDEX_TYPE Size0,
+            INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      camp::integral_constant<size_t, INDEX>>
+  {
+
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
+
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE& result,
+                                     TileType const& tile,
+                                     LEFT_OPERAND_TYPE const& et_left,
+                                     RIGHT_OPERAND_TYPE const& et_right)
+    {
 
-    private:
+      // get tile size from matrix type
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size    = et_left.getDimSize(1);
 
-      template<typename STORAGE, typename TILE_TYPE, typename INDEX=void>
-      struct MultiplyBridge;
+      auto const offset = INDEX * tile_size;
 
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
+      if ((offset + tile_size) <= k_size)
       {
-        //using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
-
-        // get tile size from matrix type
-        auto tile_size = left_type::result_type::s_dim_elem(1);
-        auto k_size = et_left.getDimSize(1);
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
-
-        // tile over row of left and column of right
-        auto left_tile = LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-        left_tile.m_begin[0] = tile.m_begin[0];
-        left_tile.m_size[0] = tile.m_size[0];
-        left_tile.m_size[1] = tile_size;
-
-        using RightType = typename TILE_TYPE::nonstatic_self_type;
-
-        RightType right_tile = tile;
-        right_tile.m_size[0] = tile_size;
-
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
-
-          // evaluate both sides of operator
-          left_tile.m_begin[1] = k;
-          auto left = et_left.eval(left_tile);
-
-          right_tile.m_begin[0] = k;
-          auto right = et_right.eval(right_tile);
-
-          // accumulate product
-          result = left.right_multiply_vector_accumulate(right, result);
-        }
-        // remainder tile in k
-        if(k < k_size){
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[1] = k;
-          left_part_tile.m_size[1] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
-
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
-
-          // accumulate product of partial tile
-          result = left.right_multiply_vector_accumulate(right, result);
-        }
 
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, tile_size>>;
+        // evaluate both sides of operator
+        auto left = et_left.eval(LeftType());
+
+        using RightType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, tile_size>>;
+
+        auto right = et_right.eval(RightType());
+
+        // accumulate product
+        auto temp = left.right_multiply_vector_accumulate(right, result);
+        MultiplyBridge<STORAGE, TileType,
+                       camp::integral_constant<size_t, INDEX - 1>>::
+            multiply_into_result(result, tile, et_left, et_right);
+        result += temp;
       }
+      else
+      {
 
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, k_size - offset>>;
+        auto left = et_left.eval(LeftType());
 
-      template<typename T>
-      struct Diag{
-          static_assert(!std::is_same<T,void>::value,"diag");
-      };
-
-      template<typename I, TensorTileSize TTS, typename B, typename S>
-      struct Diag< StaticTensorTile<I,TTS,B,S> >{
-          static_assert(std::is_same<I,void>::value,"diag");
-      };
-
-      template<typename STORAGE, typename TILE_TYPE, typename INDEX>
-      struct MultiplyBridge {
-
-          Diag<TILE_TYPE> diag;
-
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-            //using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
-    
-            // get tile size from matrix type
-            auto tile_size = left_type::result_type::s_dim_elem(1);
-            auto k_size = et_left.getDimSize(1);
-            // TODO: check that left and right are compatible
-            // m_left.getDimSize(1) == m_right.getDimSize(0)
-            // how do we provide checking for this kind of error?
-    
-            // tile over row of left and column of right
-            auto left_tile = LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-            left_tile.m_begin[0] = tile.m_begin[0];
-            left_tile.m_size[0] = tile.m_size[0];
-            left_tile.m_size[1] = tile_size;
-    
-            using RightType = typename TILE_TYPE::nonstatic_self_type;
-
-            RightType right_tile = tile;
-            right_tile.m_size[0] = tile_size;
-    
-            // Do full tiles in k
-            decltype(k_size) k = 0;
-            for(;k+tile_size <= k_size; k+= tile_size){
-    
-              // evaluate both sides of operator
-              left_tile.m_begin[1] = k;
-              auto left = et_left.eval(left_tile);
-    
-              right_tile.m_begin[0] = k;
-              auto right = et_right.eval(right_tile);
-    
-              // accumulate product
-              result = left.right_multiply_vector_accumulate(right, result);
-            }
-            // remainder tile in k
-            if(k < k_size){
-              auto &left_part_tile = make_tensor_tile_partial(left_tile);
-              left_part_tile.m_begin[1] = k;
-              left_part_tile.m_size[1] = k_size-k;
-              auto left = et_left.eval(left_part_tile);
-    
-              auto &right_part_tile = make_tensor_tile_partial(right_tile);
-              right_part_tile.m_begin[0] = k;
-              right_part_tile.m_size[0] = k_size-k;
-              auto right = et_right.eval(right_part_tile);
-    
-              // accumulate product of partial tile
-              result = left.right_multiply_vector_accumulate(right, result);
-            }
-    
-          }
-      };
-
-
-
-
-      template<
-          size_t INDEX,
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0, INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0, INDEX_TYPE... SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          camp::integral_constant<size_t,INDEX>
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              // get tile size from matrix type
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-             
-              auto const offset = INDEX*tile_size;
-
-              if( (offset + tile_size) <= k_size ) {
-    
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE, Begin0,    offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, tile_size>
-                    >;
-                    // evaluate both sides of operator
-                    auto left = et_left.eval(LeftType());
-
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE,    offset>,
-                        camp::int_seq<INDEX_TYPE, tile_size>
-                    >;
-    
-                    auto right = et_right.eval(RightType());
-    
-                    // accumulate product
-                    auto temp = left.right_multiply_vector_accumulate(right, result);
-                    MultiplyBridge<STORAGE,TileType,camp::integral_constant<size_t,INDEX-1>>::multiply_into_result(result,tile,et_left,et_right);
-                    result += temp;
-                    
-              } else {
-
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE, Begin0,        offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, k_size-offset>
-                    >;
-		    auto left = et_left.eval(LeftType());
-	    
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE,        offset>,
-                        camp::int_seq<INDEX_TYPE, k_size-offset>
-                    >;
-		    auto right = et_right.eval(RightType());
-	    
-		    // accumulate product of partial tile
-		    result = left.right_multiply_vector_accumulate(right, result);
-
-              }
-
-
-            }
-          };
-
-
-      template<
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0, INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0, INDEX_TYPE...  SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          camp::integral_constant<size_t,0>
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              // get tile size from matrix type
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-             
-              auto const offset = 0;
-
-              if( (offset + tile_size) <= k_size ) {
-    
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE, Begin0,    offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, tile_size>
-                    >;
-                    // evaluate both sides of operator
-                    auto left = et_left.eval(LeftType());
-
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE,    offset>,
-                        camp::int_seq<INDEX_TYPE, tile_size>
-                    >;
-    
-                    auto right = et_right.eval(RightType());
-    
-                    // accumulate product
-                    auto temp = left.right_multiply_vector_accumulate(right, result);
-                    result += temp;
-                    
-              } else {
-
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE, Begin0,        offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, k_size-offset>
-                    >;
-		    auto left = et_left.eval(LeftType());
-	    
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE,        offset>,
-                        camp::int_seq<INDEX_TYPE, k_size-offset>
-                    >;
-		    auto right = et_right.eval(RightType());
-	    
-		    // accumulate product of partial tile
-		    result = left.right_multiply_vector_accumulate(right, result);
-
-              }
-
-
-            }
-          };
-
-      template<
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0,  INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0,  INDEX_TYPE... SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          void
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-              const size_t iter_count = (k_size/tile_size) + ( (k_size%tile_size != 0) ? 1 : 0 );
-
-              MultiplyBridge<STORAGE,TileType,camp::integral_constant<size_t,iter_count>>::multiply_into_result(result,tile,et_left,et_right);
-
-            }
-          };
-
-      };
-
-
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_OPERAND_TYPE>
-    class TensorMultiplyAdd;
-
-
-    /*!
-     * Specialization for vector*matrix left multiplication.
-     *
-     * By default the x'*A operator for two matrices produces a vector-matrix
-     * multiplication.
-     *
-     * The left hand side vector is always treated as a row vector.
-     *
-     * The resulting vector type is inherited from the LHS
-     *
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
+        using RightType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, k_size - offset>>;
+        auto right = et_right.eval(RightType());
+
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
+      }
+    }
+  };
+
+
+  template <typename STORAGE,
+            typename INDEX_TYPE,
+            TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE Begin0,
+            INDEX_TYPE... BeginTail,
+            INDEX_TYPE Size0,
+            INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      camp::integral_constant<size_t, 0>>
+  {
+
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
+
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE& result,
+                                     TileType const&,
+                                     LEFT_OPERAND_TYPE const& et_left,
+                                     RIGHT_OPERAND_TYPE const& et_right)
     {
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type::row_vector_type;
-      static constexpr camp::idx_t s_num_dims = 1;
+      // get tile size from matrix type
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size    = et_left.getDimSize(1);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Vector*Matrix");
-      }
+      auto const offset = 0;
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return dim == 0 ? left.getDimSize(0) : 0;
-      }
+      if ((offset + tile_size) <= k_size)
+      {
+
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, tile_size>>;
+        // evaluate both sides of operator
+        auto left = et_left.eval(LeftType());
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right){
-        // clear result
-        result_type result(0);
-
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
-
-        return result;
+        using RightType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, tile_size>>;
+
+        auto right = et_right.eval(RightType());
+
+        // accumulate product
+        auto temp = left.right_multiply_vector_accumulate(right, result);
+        result += temp;
       }
+      else
+      {
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add){
-        // evaluate add into result
-        result_type result = add.eval(tile);
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, k_size - offset>>;
+        auto left = et_left.eval(LeftType());
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+        using RightType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, k_size - offset>>;
+        auto right = et_right.eval(RightType());
 
-        return result;
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
       }
+    }
+  };
+
+  template <typename STORAGE,
+            typename INDEX_TYPE,
+            TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE Begin0,
+            INDEX_TYPE... BeginTail,
+            INDEX_TYPE Size0,
+            INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      void>
+  {
 
-    private:
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-      {
-        // get tile size from matrix type
-        auto tile_size = right_type::result_type::s_dim_elem(0);
-        auto k_size = et_right.getDimSize(0);
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
+
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE& result,
+                                     TileType const& tile,
+                                     LEFT_OPERAND_TYPE const& et_left,
+                                     RIGHT_OPERAND_TYPE const& et_right)
+    {
 
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size    = et_left.getDimSize(1);
+      const size_t iter_count =
+          (k_size / tile_size) + ((k_size % tile_size != 0) ? 1 : 0);
 
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
+      MultiplyBridge<STORAGE, TileType,
+                     camp::integral_constant<size_t, iter_count>>::
+          multiply_into_result(result, tile, et_left, et_right);
+    }
+  };
+};
 
-        // tile over row of left and column of right
-        auto right_tile = RIGHT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-        right_tile.m_begin[1] = tile.m_begin[0];
-        right_tile.m_size[1] = tile.m_size[0];
-        right_tile.m_size[0] = tile_size;
 
-        TILE_TYPE left_tile = tile;
-        left_tile.m_size[0] = tile_size;
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          typename ADD_OPERAND_TYPE>
+class TensorMultiplyAdd;
 
 
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
+/*!
+ * Specialization for vector*matrix left multiplication.
+ *
+ * By default the x'*A operator for a vector and a matrix produces a
+ * vector-matrix multiplication.
+ *
+ * The left hand side vector is always treated as a row vector.
+ *
+ * The resulting vector type is the row_vector_type of the RHS matrix type
+ *
+ *
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
 
-          // evaluate both sides of operator
-          right_tile.m_begin[0] = k;
-          auto right = et_right.eval(right_tile);
+  using left_type   = LEFT_OPERAND_TYPE;
+  using right_type  = RIGHT_OPERAND_TYPE;
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type::row_vector_type;
+  static constexpr camp::idx_t s_num_dims = 1;
 
-          left_tile.m_begin[0] = k;
-          auto left = et_left.eval(left_tile);
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Vector*Matrix"); }
 
-          // accumulate product
-          result = right.left_multiply_vector_accumulate(left, result);
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const& left, RIGHT_OPERAND_TYPE const&)
+  {
+    return dim == 0 ? left.getDimSize(0) : 0;
+  }
+
+  /*!
+   * Evaluate operands and compute the vector-matrix product
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  multiply(TILE_TYPE const& tile,
+           LEFT_OPERAND_TYPE const& left,
+           RIGHT_OPERAND_TYPE const& right)
+  {
+    // clear result
+    result_type result(0);
 
-        }
-        // remainder tile in k
-        if(k < k_size){
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
 
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[0] = k;
-          left_part_tile.m_size[0] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
+    return result;
+  }
 
-          // compute product into x of partial tile
-          result = right.left_multiply_vector_accumulate(left, result);
-        }
+  template <typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  multiply_add(TILE_TYPE const& tile,
+               LEFT_OPERAND_TYPE const& left,
+               RIGHT_OPERAND_TYPE const& right,
+               ADD_TYPE const& add)
+  {
+    // evaluate add into result
+    result_type result = add.eval(tile);
+
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+private:
+  template <typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void
+  multiply_into_result(STORAGE& result,
+                       TILE_TYPE const& tile,
+                       LEFT_OPERAND_TYPE const& et_left,
+                       RIGHT_OPERAND_TYPE const& et_right)
+  {
+    // get tile size from matrix type
+    auto tile_size = right_type::result_type::s_dim_elem(0);
+    auto k_size    = et_right.getDimSize(0);
 
-      }
 
-    };
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
 
+    // tile over row of left and column of right
+    auto right_tile =
+        RIGHT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+    right_tile.m_begin[1] = tile.m_begin[0];
+    right_tile.m_size[1]  = tile.m_size[0];
+    right_tile.m_size[0]  = tile_size;
 
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[0] = tile_size;
 
-    /*!
-     * Specialization for matrix-matrix multiplication for TensorRegisters
+
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size)
+    {
+
+      // evaluate both sides of operator
+      right_tile.m_begin[0] = k;
+      auto right            = et_right.eval(right_tile);
+
+      left_tile.m_begin[0] = k;
+      auto left            = et_left.eval(left_tile);
+
+      // accumulate product
+      result = right.left_multiply_vector_accumulate(left, result);
+    }
+    // remainder tile in k
+    if (k < k_size)
+    {
+      auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k;
+      right_part_tile.m_size[0]  = k_size - k;
+      auto right                 = et_right.eval(right_part_tile);
+
+      auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[0] = k;
+      left_part_tile.m_size[0]  = k_size - k;
+      auto left                 = et_left.eval(left_part_tile);
+
+      // accumulate product of partial tile
+      result = right.left_multiply_vector_accumulate(left, result);
+    }
+  }
+};
+
+
+/*!
+ * Specialization for matrix-matrix multiplication for TensorRegisters
+ *
+ * By default the A*B operator for two matrices produces a matrix-matrix
+ * multiplication.
+ *
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
+
+  using left_type   = LEFT_OPERAND_TYPE;
+  using right_type  = RIGHT_OPERAND_TYPE;
+  using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
+  static constexpr camp::idx_t s_num_dims = 2;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Matrix"); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
+  }
+
+  /*!
+   * Evaluate operands and compute the matrix-matrix product
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  multiply(TILE_TYPE const& tile,
+           LEFT_OPERAND_TYPE const& left,
+           RIGHT_OPERAND_TYPE const& right)
+  {
+
+    /*
+     *
+     * For TensorRegister:
+     *
+     *   Returns a register containing the product of left and right operands
+     *
+     * For TensorBlock:
+     *
+     *  Returns an ET TensorLiteral containing the left and right operands
+     *
+     *  OR
      *
-     * By default the A*B operator for two matrices produces a matrix-matrix
-     * multiplication.
+     *  Returns an ET multiply
      *
      */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<
-    LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
-    {
+    // create zeroed temporary
+    result_type result;
+    result.broadcast(0);
+
+    // multiply left and right operands into temporary
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+  template <typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  multiply_add(TILE_TYPE const& tile,
+               LEFT_OPERAND_TYPE const& left,
+               RIGHT_OPERAND_TYPE const& right,
+               ADD_TYPE const& add)
+  {
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
-      static constexpr camp::idx_t s_num_dims = 2;
+    // start accumulator with addition term
+    result_type result = add.eval(tile);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Matrx*Matrix");
-      }
+    multiply_into_result(result, tile, left, right);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-        return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-      }
+    return result;
+  }
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
+private:
+  template <typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void
+  multiply_into_result(STORAGE& result,
+                       TILE_TYPE const& tile,
+                       LEFT_OPERAND_TYPE const& et_left,
+                       RIGHT_OPERAND_TYPE const& et_right)
+  {
+    // get tile size from matrix type
+    using right_tensor_type = typename right_type::result_type;
+    auto tile_size          = right_tensor_type::s_dim_elem(0);
+    auto k_size             = et_left.getDimSize(1);
 
-        /*
-         *
-         * For TensorRegister:
-         *
-         *   Return's a register containing product of left and right operands
-         *
-         * For TensorBlock:
-         *
-         *  Return's an ET TensorLiteral containing the left and right operrands
-         *
-         *  OR
-         *
-         *  Returns an ET multiply
-         *
-         */
-        // create zeroed temporary
-        result_type result;
-        result.broadcast(0);
-
-        // multiply left and right operands into temporary
-        multiply_into_result(result, tile, left,right);
-
-        return result;
-      }
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add)
-      {
+    // tile over row of left and column of right
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[1] = tile_size;
+    auto left_begin     = et_left.getDimBegin(1);
 
-        // start accumulator with addition term
-        result_type result = add.eval(tile);
+    TILE_TYPE right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+    auto right_begin     = et_right.getDimBegin(0);
 
-        multiply_into_result(result, tile, left, right);
 
-        return result;
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size)
+    {
 
-      }
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k + left_begin;
+      auto left            = et_left.eval(left_tile);
 
-    private:
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-      {
-        // get tile size from matrix type
-        using right_tensor_type = typename right_type::result_type;
-        auto tile_size = right_tensor_type::s_dim_elem(0);
-        auto k_size = et_left.getDimSize(1);
-
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
-
-        // tile over row of left and column of right
-        TILE_TYPE left_tile = tile;
-        left_tile.m_size[1] = tile_size;
-        auto left_begin = et_left.getDimBegin(1);
-
-        TILE_TYPE right_tile = tile;
-        right_tile.m_size[0] = tile_size;
-        auto right_begin = et_right.getDimBegin(0);
-
-
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
-
-          // evaluate both sides of operator
-          left_tile.m_begin[1] = k + left_begin;
-          auto left = et_left.eval(left_tile);
-
-          right_tile.m_begin[0] = k + right_begin;
-          auto right = et_right.eval(right_tile);
-
-          // accumulate product
-          left.matrix_multiply_accumulate(result, right);
-        }
-        // remainder tile in k
-        if(k < k_size){
-
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[1] = k + left_begin;
-          left_part_tile.m_size[1] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
-
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k + right_begin;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
-
-          // accumulate product
-          left.matrix_multiply_accumulate(result, right);
-        }
-      }
+      right_tile.m_begin[0] = k + right_begin;
+      auto right            = et_right.eval(right_tile);
 
-    };
+      // accumulate product
+      left.matrix_multiply_accumulate(result, right);
+    }
+    // remainder tile in k
+    if (k < k_size)
+    {
 
+      auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k + left_begin;
+      left_part_tile.m_size[1]  = k_size - k;
+      auto left                 = et_left.eval(left_part_tile);
 
+      auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k + right_begin;
+      right_part_tile.m_size[0]  = k_size - k;
+      auto right                 = et_right.eval(right_part_tile);
 
+      // accumulate product
+      left.matrix_multiply_accumulate(result, right);
+    }
+  }
+};
 
 
-    template<typename OPERAND_TYPE, typename TILE_TYPE>
-    class RestrictExtents : public TensorExpressionBase<RestrictExtents<OPERAND_TYPE, TILE_TYPE>> {
-      public:
-        using self_type = RestrictExtents<OPERAND_TYPE, TILE_TYPE>;
-        using operand_type = OPERAND_TYPE;
-        using result_type = typename OPERAND_TYPE::result_type;
-        using index_type = typename TILE_TYPE::index_type;
-        using tile_type = TILE_TYPE;
-        static constexpr camp::idx_t s_num_dims = OPERAND_TYPE::s_num_dims;
+template <typename OPERAND_TYPE, typename TILE_TYPE>
+class RestrictExtents
+    : public TensorExpressionBase<RestrictExtents<OPERAND_TYPE, TILE_TYPE>>
+{
+public:
+  using self_type    = RestrictExtents<OPERAND_TYPE, TILE_TYPE>;
+  using operand_type = OPERAND_TYPE;
+  using result_type  = typename OPERAND_TYPE::result_type;
+  using index_type   = typename TILE_TYPE::index_type;
+  using tile_type    = TILE_TYPE;
+  static constexpr camp::idx_t s_num_dims = OPERAND_TYPE::s_num_dims;
+
+private:
+  operand_type m_operand;
+  tile_type m_tile;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  RestrictExtents(operand_type const& operand, tile_type const& tile)
+      : m_operand {operand}, m_tile {tile}
+  {}
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return m_tile.m_size[dim];
+  }
 
-      private:
-        operand_type m_operand;
-        tile_type m_tile;
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimBegin(camp::idx_t dim) const
+  {
+    return m_tile.m_begin[dim];
+  }
 
-      public:
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        RestrictExtents(operand_type const &operand, tile_type const &tile) :
-        m_operand{operand}, m_tile{tile}
-        {}
+  template <typename TILE_TYPE2>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE2 const& tile) const
+      -> decltype(m_operand.eval(tile))
+  {
+    return m_operand.eval(tile);
+  }
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("RestrictExtents(");
+    m_operand.print_ast();
+    printf(")");
+  }
+};
+
+template <typename OPERAND, typename TILE>
+RestrictExtents<OPERAND, TILE> restrictExtents(OPERAND const& operand,
+                                               TILE const& tile)
+{
+  using tile_type = typename OPERAND::tile_type;
+  tile_type new_tile;
+  new_tile.copy(tile);
+  return RestrictExtents<OPERAND, TILE>(operand, new_tile);
+}
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tile.m_size[dim];
-        }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimBegin(camp::idx_t dim) const {
-          return m_tile.m_begin[dim];
-        }
+/*!
+ * Specialization for matrix-matrix multiplication for TensorBlocks
+ *
+ * By default the A*B operator for two matrices produces a matrix-matrix
+ * multiplication.
+ *
+ */
 
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<
+        std::is_base_of<TensorBlockConcreteBase,
+                        typename RIGHT_OPERAND_TYPE::tensor_type>::value &&
+        LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+        RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
+  using left_type   = LEFT_OPERAND_TYPE;
+  using right_type  = RIGHT_OPERAND_TYPE;
+  using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
+  static constexpr camp::idx_t s_num_dims = 2;
 
-        template<typename TILE_TYPE2>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE2 const &tile) const ->
-          decltype(m_operand.eval(tile))
-        {
-          return m_operand.eval(tile);
-        }
+  //      static_assert(LEFT_OPERAND_TYPE::s_num_dims == 1, "WHAOO");
+  //      static_assert(! std::is_base_of<TensorBlockConcreteBase, typename
+  //      RIGHT_OPERAND_TYPE::tensor_type>::value, "MATCH");
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("RestrictExtents(");
-          m_operand.print_ast();
-          printf(")");
-        }
 
+  // This tensor type is a TensorBlock of some kind
+  using tensor_type = typename RIGHT_OPERAND_TYPE::tensor_type;
 
-    };
+  // Get the storage type from the TensorBlock
+  using storage_type = typename tensor_type::storage_type;
 
-    template<typename OPERAND, typename TILE>
-    RestrictExtents<OPERAND, TILE> restrictExtents(OPERAND const &operand, TILE const &tile){
-      using tile_type = typename OPERAND::tile_type;
-      tile_type new_tile;
-      new_tile.copy(tile);
-      return RestrictExtents<OPERAND, TILE>(operand, new_tile);
-    }
+  // Create a BlockLiteral that uses the TensorBlock's indicated storage
+  // and has an eval() that produces the TensorBlock's register type
+  using block_literal =
+      BlockLiteral<storage_type, typename tensor_type::register_type>;
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Matrix"); }
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
+  }
+
+  /*!
+   * Create a BlockLiteral for the tile; no product is evaluated here
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static block_literal multiply(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const&,
+      RIGHT_OPERAND_TYPE const&)  //->
+                                  /// decltype(TensorMultiply<decltype(left.eval(tile)),
+                                  /// decltype(right.eval(tile))>(left.eval(tile),
+                                  /// right.eval(tile)))
+  {
 
-    /*!
-     * Specialization for matrix-matrix multiplication for TensorBlocks
+    /*
+     * First pass:  just return a Multiply ET that evaluates the block
+     * with underlying TensorRegisters
      *
-     * By default the A*B operator for two matrices produces a matrix-matrix
-     * multiplication.
+     *
+     * Second pass: we want to return a TensorLiteral ET node with the
+     * matrix product already evaluated?
+     *
+     * What we really care about is improving the data reuse: so perhaps
+     * returning a Multiply ET node with TensorLiteral nodes for each
+     * of the operands
+     *
+     */
+    // create a BlockLiteral
+    block_literal result(tile);
+
+    // evaluate the block-wise product into result
+
+    // return TensorMultiply<decltype(left.eval(tile)),
+    // decltype(right.eval(tile))>(left.eval(tile), right.eval(tile));
+
+    // return the BlockLiteral ET
+    return result;
+  }
+
+  template <typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static block_literal multiply_add(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      ADD_TYPE const&
+          add)  //->
+                // decltype(TensorMultiplyAdd<decltype(left.eval(tile)),
+                // decltype(right.eval(tile)),
+                // decltype(add.eval(tile))>(left.eval(tile),
+                // right.eval(tile), add.eval(tile)))
+  {
+    /*
+     * First pass:  we want to return a BlockLiteral ET node with the
+     * matrix product already evaluated.  We do this by creating
+     * a LoadStore node wrapping the BlockLiteral, and evaluating it as
+     * a sub-expression.
+     *
+     * What we really care about is improving the data reuse: so perhaps
+     * returning a Multiply ET node with TensorLiteral nodes for each
+     * of the operands
      *
      */
 
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<
-    std::is_base_of<TensorBlockConcreteBase, typename RIGHT_OPERAND_TYPE::tensor_type>::value &&
-    LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
+    // create a BlockLiteral
+    using block_tile_type = typename block_literal::tile_type;
+    block_tile_type block_tile;
+    block_tile.copy(tile);
+    block_literal result(block_tile);
+
+    using ref_type        = typename block_literal::ref_type;
+    using load_store_type = TensorLoadStore<tensor_type, ref_type>;
+
+    // initialize the result with our addition term
+    auto result_et = load_store_type(result.get_ref()).eval(tile);
+    result_et      = add.eval(tile);
+
+    // return TensorMultiplyAdd<decltype(left.eval(tile)),
+    // decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile),
+    // right.eval(tile), add.eval(tile));
+
+    //          multiply_into_result(result_et, tile, restrictExtents(left,
+    //          tile), restrictExtents(right, tile));
+    multiply_into_result(result_et, tile, left, right);
+
+    // return the BlockLiteral ET
+    return result;
+  }
+
+private:
+  template <typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void
+  multiply_into_result(STORAGE& result,
+                       TILE_TYPE const& tile,
+                       LEFT_OPERAND_TYPE const& et_left,
+                       RIGHT_OPERAND_TYPE const& et_right)
+  {
+
+    // get tile size from matrix type
+    auto tile_size = result_type::s_dim_elem(1);
+    auto k_size    = et_left.getDimSize(1);
+
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
+
+    // tile over row of left and column of right
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[1] = tile_size;
+    auto left_begin     = et_left.getDimBegin(1);
+
+    TILE_TYPE right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+    auto right_begin     = et_right.getDimBegin(0);
+
+
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size)
     {
-        using left_type = LEFT_OPERAND_TYPE;
-        using right_type = RIGHT_OPERAND_TYPE;
-        using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
-        static constexpr camp::idx_t s_num_dims = 2;
 
-  //      static_assert(LEFT_OPERAND_TYPE::s_num_dims == 1, "WHAOO");
-  //      static_assert(! std::is_base_of<TensorBlockConcreteBase, typename RIGHT_OPERAND_TYPE::tensor_type>::value, "MATCH");
-
-
-        // This tensor type is a TensorBlock of some kind
-        using tensor_type = typename RIGHT_OPERAND_TYPE::tensor_type;
-
-        // Get the storage type from the TensorBlock
-        using storage_type = typename tensor_type::storage_type;
-
-        // Create a BlockLiteral that uses the TensorBlock's indicated storage
-        // and has an eval() that produces the TensorBlock's register type
-        using block_literal = BlockLiteral<storage_type,
-                                           typename tensor_type::register_type>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Matrx*Matrix");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-          return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-        }
-
-        /*!
-         * Evaluate operands and perform element-wise multiply
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        block_literal multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &) //->
-          ///decltype(TensorMultiply<decltype(left.eval(tile)), decltype(right.eval(tile))>(left.eval(tile), right.eval(tile)))
-        {
-
-          /*
-           * First pass:  just return a Multiply ET that evaluates the block
-           * with underlying TensorRegisters
-           *
-           *
-           * Second pass: we want to return a TensorLiteral ET node with the
-           * matrix product already evaluated.?
-           *
-           * What we really care about is improving the data reuse: so perhaps
-           * returning a Multiply ET node with TensorLiteral nodes for each
-           * of the operands
-           *
-           */
-          // create a BlockLiteral
-          block_literal result(tile);
-
-          // evaluate the block-wise product into result
-
-          //return TensorMultiply<decltype(left.eval(tile)), decltype(right.eval(tile))>(left.eval(tile), right.eval(tile));
-
-          // return the BlockLiterat ET
-          return result;
-        }
-
-        template<typename TILE_TYPE, typename ADD_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        block_literal multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add) //->
-          //decltype(TensorMultiplyAdd<decltype(left.eval(tile)), decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile), right.eval(tile), add.eval(tile)))
-        {
-          /*
-           * First pass:  we want to return a BlockLiteral ET node with the
-           * matrix product already evaluated.  We do this by creating
-           * a LoadStore node wrapping the BlockLiteral, and evaluating it as
-           * a sub-expression.
-           *
-           * What we really care about is improving the data reuse: so perhaps
-           * returning a Multiply ET node with TensorLiteral nodes for each
-           * of the operands
-           *
-           */
-
-          // create a BlockLiteral
-          using block_tile_type = typename block_literal::tile_type;
-          block_tile_type block_tile;
-          block_tile.copy(tile);
-          block_literal result(block_tile);
-
-          using ref_type = typename block_literal::ref_type;
-          using load_store_type = TensorLoadStore<tensor_type, ref_type>;
-
-          // initialize the result with our addition term
-          auto result_et = load_store_type(result.get_ref()).eval(tile);
-          result_et = add.eval(tile);
-
-          //return TensorMultiplyAdd<decltype(left.eval(tile)), decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile), right.eval(tile), add.eval(tile));
-
-//          multiply_into_result(result_et, tile, restrictExtents(left, tile), restrictExtents(right, tile));
-          multiply_into_result(result_et, tile, left, right);
-
-          // return the BlockLiterat ET
-          return result;
-        }
-
-      private:
-
-        template<typename STORAGE, typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-        {
-
-          // get tile size from matrix type
-          auto tile_size = result_type::s_dim_elem(1);
-          auto k_size = et_left.getDimSize(1);
-
-          // TODO: check that left and right are compatible
-          // m_left.getDimSize(1) == m_right.getDimSize(0)
-          // how do we provide checking for this kind of error?
-
-          // tile over row of left and column of right
-          TILE_TYPE left_tile = tile;
-          left_tile.m_size[1] = tile_size;
-          auto left_begin = et_left.getDimBegin(1);
-
-          TILE_TYPE right_tile = tile;
-          right_tile.m_size[0] = tile_size;
-          auto right_begin = et_right.getDimBegin(0);
-
-
-
-          // Do full tiles in k
-          decltype(k_size) k = 0;
-          for(;k+tile_size <= k_size; k+= tile_size){
-
-
-            // evaluate both sides of operator
-            left_tile.m_begin[1] = k + left_begin;
-            auto left = et_left.eval(left_tile);
-
-            right_tile.m_begin[0] = k + right_begin;
-            auto right = et_right.eval(right_tile);
-
-            // accumulate product
-            //left.matrix_multiply_accumulate(result, right);
-            result += restrictExtents(left, left_tile) * restrictExtents(right, right_tile);
-          }
-          // remainder tile in k
-          if(k < k_size){
-
-            auto &left_part_tile = make_tensor_tile_partial(left_tile);
-            left_part_tile.m_begin[1] = k + left_begin;
-            left_part_tile.m_size[1] = k_size-k;
-            auto left = et_left.eval(left_part_tile);
-
-            auto &right_part_tile = make_tensor_tile_partial(right_tile);
-            right_part_tile.m_begin[0] = k + right_begin;
-            right_part_tile.m_size[0] = k_size-k;
-            auto right = et_right.eval(right_part_tile);
-
-            // accumulate product
-            //left.matrix_multiply_accumulate(result, right);
-            result += restrictExtents(left, left_part_tile) * restrictExtents(right, right_part_tile);
-          }
-        }
-    };
-
-
-  } // namespace ET
 
-  } // namespace internal
-} // namespace expt
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k + left_begin;
+      auto left            = et_left.eval(left_tile);
+
+      right_tile.m_begin[0] = k + right_begin;
+      auto right            = et_right.eval(right_tile);
+
+      // accumulate product
+      // left.matrix_multiply_accumulate(result, right);
+      result +=
+          restrictExtents(left, left_tile) * restrictExtents(right, right_tile);
+    }
+    // remainder tile in k
+    if (k < k_size)
+    {
+
+      auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k + left_begin;
+      left_part_tile.m_size[1]  = k_size - k;
+      auto left                 = et_left.eval(left_part_tile);
+
+      auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k + right_begin;
+      right_part_tile.m_size[0]  = k_size - k;
+      auto right                 = et_right.eval(right_part_tile);
+
+      // accumulate product
+      // left.matrix_multiply_accumulate(result, right);
+      result += restrictExtents(left, left_part_tile) *
+                restrictExtents(right, right_part_tile);
+    }
+  }
+};
+
+
+}  // namespace ET
+
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
index faa92747dd..34998af6bd 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
@@ -33,346 +33,381 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          class ENABLE = void>
+struct DivideOperator;
+
+
+/*!
+ * Specialization that provides dividing a scalar by a vector
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+{
+
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const&, RIGHT_OPERAND_TYPE const& right)
   {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  divide(TILE_TYPE const& tile,
+         LEFT_OPERAND_TYPE const& left,
+         RIGHT_OPERAND_TYPE const& right)
+  {
+    result_type numerator(left.eval(tile));
 
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, class ENABLE = void>
-    struct DivideOperator;
+    if (tile.s_tensor_size == TENSOR_FULL)
+    {
+      return numerator.divide(right.eval(tile));
+    }
+
+    return numerator.divide_n(right.eval(tile), tile.m_size[0]);
+  }
+};
+
+
+/*!
+ * Specialization that provides dividing a vector by a scalar
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+{
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
 
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const& left, RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  divide(TILE_TYPE const& tile,
+         LEFT_OPERAND_TYPE const& left,
+         RIGHT_OPERAND_TYPE const& right)
+  {
+    result_type denominator(right.eval(tile));
 
-    /*!
-     * Specialization that provides dividing a scalar by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 && RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+    if (tile.s_tensor_size == TENSOR_FULL)
+    {
+      return left.eval(tile).divide(denominator);
+    }
+    else
     {
+      return left.eval(tile).divide_n(denominator, tile.m_size[0]);
+    }
+  }
+};
+
 
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return right.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type numerator(left.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return numerator.divide(right.eval(tile));
-        }
-
-        return numerator.divide_n(right.eval(tile), tile.m_size[0]);
-
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+/*!
+ * Specialization that provides dividing a vector by a vector
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+{
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const& left, RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  divide(TILE_TYPE const& tile,
+         LEFT_OPERAND_TYPE const& left,
+         RIGHT_OPERAND_TYPE const& right)
+  {
+    if (tile.s_tensor_size == TENSOR_FULL)
     {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type denominator(right.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(denominator);
-        }
-        else{
-          return left.eval(tile).divide_n(denominator, tile.m_size[0]);
-        }
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+      return left.eval(tile).divide(right.eval(tile));
+    }
+    else
     {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(right.eval(tile));
-        }
-        else{
-          return left.eval(tile).divide_n(right.eval(tile), tile.m_size[0]);
-        }
-      }
-    };
-
-
-
-
-
-
-    /*!
-     * Specialization that provides dividing a scalar by a matrix
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 && RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+      return left.eval(tile).divide_n(right.eval(tile), tile.m_size[0]);
+    }
+  }
+};
+
+
+/*!
+ * Specialization that provides dividing a scalar by a matrix
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
+
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const&, RIGHT_OPERAND_TYPE const& right)
+  {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  divide(TILE_TYPE const& tile,
+         LEFT_OPERAND_TYPE const& left,
+         RIGHT_OPERAND_TYPE const& right)
+  {
+    result_type numerator(left.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL)
     {
+      return numerator.divide(right.eval(tile));
+    }
+
+    return numerator.divide_nm(right.eval(tile), tile.m_size[0],
+                               tile.m_size[1]);
+  }
+};
 
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return right.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type numerator(left.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return numerator.divide(right.eval(tile));
-        }
-
-        return numerator.divide_nm(right.eval(tile), tile.m_size[0], tile.m_size[1]);
-
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+
+/*!
+ * Specialization that provides dividing a matrix by a scalar
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+{
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const& left, RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  divide(TILE_TYPE const& tile,
+         LEFT_OPERAND_TYPE const& left,
+         RIGHT_OPERAND_TYPE const& right)
+  {
+    result_type denominator(right.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL)
+    {
+      return left.eval(tile).divide(denominator);
+    }
+    else
     {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type denominator(right.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(denominator);
-        }
-        else{
-          return left.eval(tile).divide_nm(denominator, tile.m_size[0], tile.m_size[1]);
-        }
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+      return left.eval(tile).divide_nm(denominator, tile.m_size[0],
+                                       tile.m_size[1]);
+    }
+  }
+};
+
+
+/*!
+ * Specialization that provides dividing a matrix by a matrix
+ */
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static int
+  getDimSize(int dim, LEFT_OPERAND_TYPE const& left, RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type
+  divide(TILE_TYPE const& tile,
+         LEFT_OPERAND_TYPE const& left,
+         RIGHT_OPERAND_TYPE const& right)
+  {
+    if (tile.s_tensor_size == TENSOR_FULL)
     {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(right.eval(tile));
-        }
-        else{
-          return left.eval(tile).divide_nm(right.eval(tile), tile.m_size[0], tile.m_size[1]);
-        }
-      }
-    };
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    class TensorDivide: public TensorExpressionBase<TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using divide_op = DivideOperator<left_operand_type, right_operand_type>;
-        using result_type = typename divide_op::result_type;
-        static constexpr camp::idx_t s_num_dims = divide_op::s_num_dims;
-
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorDivide(left_operand_type const &left_operand, right_operand_type const &right_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return divide_op::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const
-        {
-          return divide_op::divide(tile, m_left_operand, m_right_operand);
-        }
-
-        /*!
-         * Returns the LHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        left_operand_type const &getLeftOperand() const {
-          return m_left_operand;
-        }
-
-        /*!
-         * Returns the RHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        right_operand_type const &getRightOperand() const {
-          return m_right_operand;
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Divide(");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-    /*
-     * Overload for:    arithmetic / tensorexpression
-
-     */
-    template<typename LHS, typename RHS,
-      typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator/(LHS const &left_operand, RHS const &right_operand) ->
-    TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>
+      return left.eval(tile).divide(right.eval(tile));
+    }
+    else
     {
-      return TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>(NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+      return left.eval(tile).divide_nm(right.eval(tile), tile.m_size[0],
+                                       tile.m_size[1]);
     }
+  }
+};
+
+
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+class TensorDivide : public TensorExpressionBase<
+                         TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>>
+{
+public:
+  using self_type         = TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+  using left_operand_type = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using element_type       = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type         = typename LEFT_OPERAND_TYPE::index_type;
+
+  using divide_op   = DivideOperator<left_operand_type, right_operand_type>;
+  using result_type = typename divide_op::result_type;
+  static constexpr camp::idx_t s_num_dims = divide_op::s_num_dims;
+
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorDivide(left_operand_type const& left_operand,
+               right_operand_type const& right_operand)
+      : m_left_operand {left_operand}, m_right_operand {right_operand}
+  {}
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return divide_op::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const& tile) const
+  {
+    return divide_op::divide(tile, m_left_operand, m_right_operand);
+  }
+
+  /*!
+   * Returns the LHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr left_operand_type const& getLeftOperand() const
+  {
+    return m_left_operand;
+  }
+
+  /*!
+   * Returns the RHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr right_operand_type const& getRightOperand() const
+  {
+    return m_right_operand;
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Divide(");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
+
+
+/*
+ * Overload for:    arithmetic / tensorexpression
+ * The arithmetic LHS is normalized into an ET node before forming the divide.
+ */
+template <
+    typename LHS,
+    typename RHS,
+    typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator/(LHS const& left_operand,
+                                            RHS const& right_operand)
+    -> TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>
+{
+  return TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>(
+      NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+}
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
index 6720a304f2..10367f0d5b 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
@@ -33,76 +33,72 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template <typename TENSOR_TYPE>
+class TensorLiteral : public TensorExpressionBase<TensorLiteral<TENSOR_TYPE>>
+{
+public:
+  using self_type    = TensorLiteral<TENSOR_TYPE>;
+  using tensor_type  = TENSOR_TYPE;
+  using element_type = typename TENSOR_TYPE::element_type;
+  using result_type  = tensor_type;
+  using index_type   = RAJA::Index_type;
+
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return tensor_type::s_dim_elem(dim);
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  explicit TensorLiteral(tensor_type const& value) : m_value {value} {}
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const&) const
+  {
+    return result_type(m_value);
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("TensorLiteral()"); }
+
+private:
+  tensor_type m_value;
+};
+
+
+/*
+ * TensorRegister operands need to be wrapped in a constant-value ET node
+ */
+template <typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
+    typename std::enable_if<
+        std::is_base_of<TensorRegisterConcreteBase, RHS>::value>::type>
+{
+  using return_type = TensorLiteral<RHS>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const& rhs)
   {
+    return return_type(rhs);
+  }
+};
 
+}  // namespace ET
 
-    template<typename TENSOR_TYPE>
-    class TensorLiteral :  public TensorExpressionBase<TensorLiteral<TENSOR_TYPE>> {
-      public:
-        using self_type = TensorLiteral<TENSOR_TYPE>;
-        using tensor_type = TENSOR_TYPE;
-        using element_type = typename TENSOR_TYPE::element_type;
-        using result_type = tensor_type;
-        using index_type = RAJA::Index_type;
-
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return tensor_type::s_dim_elem(dim);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        TensorLiteral(tensor_type const &value) :
-        m_value{value}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &) const {
-          return result_type(m_value);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("TensorLiteral()");
-        }
-
-      private:
-        tensor_type m_value;
-    };
-
-
-    /*
-     * For TensorRegister nodes, we need to wrap this in a constant value ET node
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
-    typename std::enable_if<std::is_base_of<TensorRegisterConcreteBase, RHS>::value>::type>
-    {
-        using return_type = TensorLiteral<RHS>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return return_type(rhs);
-        }
-    };
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
index 3b69552a32..00e5b14bf5 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
@@ -34,220 +34,185 @@ namespace expt
 {
 
 
+namespace ET
+{
 
 
+template <typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
+struct TensorStoreFunctor
+{
+  LHS_TYPE const& m_lhs;
+  RHS_TYPE const& m_rhs;
 
-  namespace ET
+  template <typename TILE_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(TILE_TYPE const& tile) const
   {
 
 
+    /*
+     *
+     * For recursive ET types, eval() produces a new ET, and
+     * eval_lhs() produces a new TensorLoadStore.
+     *
+     */
 
-    template<typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
-    struct TensorStoreFunctor
-    {
-        LHS_TYPE const &m_lhs;
-        RHS_TYPE const &m_rhs;
-
-        template<typename TILE_TYPE>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void operator()(TILE_TYPE const &tile) const {
-
-
-          /*
-           *
-           * For recursive ET types, eval() produces a new ET, and
-           * eval_lhs() produces a new TensorLoadStore.
-           *
-           */
-
-          m_lhs.eval_lhs(tile) = m_rhs.eval(tile);
-
-        }
-    };
-
-    template<typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    auto makeTensorStoreFunctor(LHS_TYPE const &lhs, RHS_TYPE const &rhs) ->
-    TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>
-    {
-      return TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>{lhs, rhs};
-    }
-
-
-    template<typename TENSOR_TYPE, typename REF_TYPE>
-    class TensorLoadStore : public TensorExpressionBase<TensorLoadStore<TENSOR_TYPE, REF_TYPE>> {
-      public:
-        using self_type = TensorLoadStore<TENSOR_TYPE, REF_TYPE>;
-        using tensor_type = TENSOR_TYPE;
-        using element_type = typename TENSOR_TYPE::element_type;
-        using index_type = typename REF_TYPE::index_type;
-        using ref_type = REF_TYPE;
-        using tile_type = typename REF_TYPE::tile_type;
-        using result_type = TENSOR_TYPE;
-
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
-
-
-      private:
-        ref_type m_ref;
-
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        TensorLoadStore(ref_type const &ref) : m_ref{ref}
-        {
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorLoadStore(self_type const &rhs) : m_ref(rhs.m_ref)
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print() const {
-          printf("TensorLoadStore: ");
-          m_ref.m_tile.print();
-        }
-
-//        RAJA_SUPPRESS_HD_WARN
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator=(self_type const &rhs)
-        {
-          store(rhs);
-          return *this;
-        }
-
-//        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator=(RHS const &rhs)
-        {
-
-          store(normalizeOperand(rhs));
-
-          return *this;
-        }
-
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator+=(RHS const &rhs)
-        {
-          store( normalizeOperand(rhs) + (*this) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator-=(RHS const &rhs)
-        {
-          store(TensorSubtract<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator*=(RHS const &rhs)
-        {
-          store(TensorMultiply<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator/=(RHS const &rhs)
-        {
-          store(TensorDivide<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(tensor_type::s_load_ref(merge_ref_tile(m_ref, tile)))
-        {
-          return tensor_type::s_load_ref(merge_ref_tile(m_ref, tile));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval_lhs(TILE_TYPE const &tile) const ->
-          decltype(TENSOR_TYPE::create_et_store_ref(merge_ref_tile(this->m_ref, tile)))
-        {
-          return TENSOR_TYPE::create_et_store_ref(merge_ref_tile(m_ref, tile));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_ref.m_tile.m_size[dim];
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Load()");
-        }
-
-      private:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        tile_type const &getTile() const {
-          return m_ref.m_tile;
-        }
-
-
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void store(RHS const &rhs)
-        {
-#ifdef RAJA_DEBUG_PRINT_ET_AST
-          printf("Store(");
-          rhs.print_ast();
-          printf(")\n");
-#endif
+    m_lhs.eval_lhs(tile) = m_rhs.eval(tile);
+  }
+};
+
+template <typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto
+makeTensorStoreFunctor(LHS_TYPE const& lhs, RHS_TYPE const& rhs)
+    -> TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>
+{
+  return TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE> {lhs, rhs};
+}
+
+
+template <typename TENSOR_TYPE, typename REF_TYPE>
+class TensorLoadStore
+    : public TensorExpressionBase<TensorLoadStore<TENSOR_TYPE, REF_TYPE>>
+{
+public:
+  using self_type    = TensorLoadStore<TENSOR_TYPE, REF_TYPE>;
+  using tensor_type  = TENSOR_TYPE;
+  using element_type = typename TENSOR_TYPE::element_type;
+  using index_type   = typename REF_TYPE::index_type;
+  using ref_type     = REF_TYPE;
+  using tile_type    = typename REF_TYPE::tile_type;
+  using result_type  = TENSOR_TYPE;
+
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
 
-          tensorTileExec<tensor_type>(m_ref.m_tile,
-              makeTensorStoreFunctor<tensor_type>(*this, rhs));
-        }
 
+private:
+  ref_type m_ref;
+
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  explicit TensorLoadStore(ref_type const& ref) : m_ref {ref} {}
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorLoadStore(self_type const& rhs) : m_ref(rhs.m_ref) {}
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print() const
+  {
+    printf("TensorLoadStore: ");
+    m_ref.m_tile.print();
+  }
+
+  //        RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator=(self_type const& rhs)
+  {
+    store(rhs);
+    return *this;
+  }
+
+  //        RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator=(RHS const& rhs)
+  {
+
+    store(normalizeOperand(rhs));
 
+    return *this;
+  }
 
 
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator+=(RHS const& rhs)
+  {
+    store(normalizeOperand(rhs) + (*this));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator-=(RHS const& rhs)
+  {
+    store(TensorSubtract<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator*=(RHS const& rhs)
+  {
+    store(TensorMultiply<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator/=(RHS const& rhs)
+  {
+    store(TensorDivide<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const& tile) const
+      -> decltype(tensor_type::s_load_ref(merge_ref_tile(m_ref, tile)))
+  {
+    return tensor_type::s_load_ref(merge_ref_tile(m_ref, tile));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval_lhs(TILE_TYPE const& tile) const
+      -> decltype(TENSOR_TYPE::create_et_store_ref(merge_ref_tile(this->m_ref,
+                                                                  tile)))
+  {
+    return TENSOR_TYPE::create_et_store_ref(merge_ref_tile(m_ref, tile));
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return m_ref.m_tile.m_size[dim];
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("Load()"); }
+
+private:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  tile_type const& getTile() const { return m_ref.m_tile; }
+
+
+  template <typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE void store(RHS const& rhs)
+  {
+#ifdef RAJA_DEBUG_PRINT_ET_AST
+    printf("Store(");
+    rhs.print_ast();
+    printf(")\n");
+#endif
 
-    };
+    tensorTileExec<tensor_type>(
+        m_ref.m_tile, makeTensorStoreFunctor<tensor_type>(*this, rhs));
+  }
+};
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
index 3e3429588f..b51aa3d8d6 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
@@ -33,127 +33,136 @@ namespace internal
 namespace expt
 {
 
-  namespace ET
+namespace ET
+{
+
+// forward decl for FMA contraction
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          typename ADD_TYPE>
+class TensorMultiplyAdd;
+
+
+template <typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+class TensorMultiply
+    : public TensorExpressionBase<
+          TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>>
+{
+public:
+  using self_type = TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+  using left_operand_type  = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+
+  using element_type = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type   = typename LEFT_OPERAND_TYPE::index_type;
+
+  using result_type                       = typename multiply_op::result_type;
+  static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorMultiply(left_operand_type const& left_operand,
+                 right_operand_type const& right_operand)
+      : m_left_operand {left_operand}, m_right_operand {right_operand}
+  {}
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr int getDimSize(int dim) const
+  {
+    return multiply_op::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const& tile) const
+      -> decltype(multiply_op::multiply(tile, m_left_operand, m_right_operand))
+  {
+    return multiply_op::multiply(tile, m_left_operand, m_right_operand);
+  }
+
+  /*!
+   * Returns the LHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr left_operand_type const& getLeftOperand() const
+  {
+    return m_left_operand;
+  }
+
+  /*!
+   * Returns the RHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr right_operand_type const& getRightOperand() const
   {
+    return m_right_operand;
+  }
+
+
+  /*!
+   * operator+ overload that forms a FMA contraction
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template <typename ADD>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorMultiplyAdd<left_operand_type,
+                                                 right_operand_type,
+                                                 normalize_operand_t<ADD>>
+  operator+(ADD const& add) const
+  {
+    return TensorMultiplyAdd<left_operand_type, right_operand_type,
+                             normalize_operand_t<ADD>>(
+        m_left_operand, m_right_operand, normalizeOperand(add));
+  }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Multiply[");
+    multiply_op::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
+
+
+/*
+ * Overload for:    arithmetic * tensorexpression
+
+ */
+template <
+    typename LHS,
+    typename RHS,
+    typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator*(LHS const& left_operand,
+                                            RHS const& right_operand)
+    -> TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>
+{
+  return TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>(
+      NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+}
+
+}  // namespace ET
 
-    // forward decl for FMA contraction
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_TYPE>
-    class TensorMultiplyAdd;
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    class TensorMultiply : public TensorExpressionBase<TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using result_type = typename multiply_op::result_type;
-        static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiply(left_operand_type const &left_operand, right_operand_type const &right_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        int getDimSize(int dim) const {
-          return multiply_op::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(multiply_op::multiply(tile, m_left_operand, m_right_operand))
-        {
-          return multiply_op::multiply(tile, m_left_operand, m_right_operand);
-        }
-
-        /*!
-         * Returns the LHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        left_operand_type const &getLeftOperand() const {
-          return m_left_operand;
-        }
-
-        /*!
-         * Returns the RHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        right_operand_type const &getRightOperand() const {
-          return m_right_operand;
-        }
-
-
-        /*!
-         * operator+ overload that forms a FMA contraction
-         */
-        RAJA_SUPPRESS_HD_WARN
-        template<typename ADD>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiplyAdd<left_operand_type, right_operand_type, normalize_operand_t<ADD>>
-        operator+(ADD const &add) const {
-          return TensorMultiplyAdd<left_operand_type, right_operand_type, normalize_operand_t<ADD>>(m_left_operand, m_right_operand, normalizeOperand(add));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Multiply[");
-          multiply_op::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-    /*
-     * Overload for:    arithmetic * tensorexpression
-
-     */
-    template<typename LHS, typename RHS,
-      typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator*(LHS const &left_operand, RHS const &right_operand) ->
-    TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>
-    {
-      return TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>(NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
-    }
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
index 44f27e92c7..a15059ed13 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
@@ -33,81 +33,90 @@ namespace expt
 {
 
 
-  namespace ET
-  {
-
-
-    /*!
-     * Expression for LHS*RHS+ADD, which allows for accessing FMA style
-     * operations.
-     *
-     * This ET can only be generated by contracting an Add and Multiple ET.
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_OPERAND_TYPE>
-    class TensorMultiplyAdd : public TensorExpressionBase<TensorMultiplyAdd<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE, ADD_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorMultiplyAdd<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE, ADD_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using add_operand_type = ADD_OPERAND_TYPE;
-        using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using result_type = typename multiply_op::result_type;
-        static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-        add_operand_type m_add_operand;
-
-      public:
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiplyAdd(left_operand_type const &left_operand, right_operand_type const &right_operand,
-                          add_operand_type const &add_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}, m_add_operand{add_operand}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(multiply_op::multiply_add(tile, m_left_operand, m_right_operand, m_add_operand))
-        {
-          return multiply_op::multiply_add(tile, m_left_operand, m_right_operand, m_add_operand);
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("MultiplyAdd[");
-          multiply_op::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(", ");
-          m_add_operand.print_ast();
-          printf(")");
-        }
-
-
-
-    };
-
+namespace ET
+{
 
 
+/*!
+ * Expression for LHS*RHS+ADD, which allows for accessing FMA style
+ * operations.
+ *
+ * This ET can only be generated by contracting an Add and Multiply ET.
+ *
+ */
+template <typename LEFT_OPERAND_TYPE,
+          typename RIGHT_OPERAND_TYPE,
+          typename ADD_OPERAND_TYPE>
+class TensorMultiplyAdd
+    : public TensorExpressionBase<TensorMultiplyAdd<LEFT_OPERAND_TYPE,
+                                                    RIGHT_OPERAND_TYPE,
+                                                    ADD_OPERAND_TYPE>>
+{
+public:
+  using self_type          = TensorMultiplyAdd<LEFT_OPERAND_TYPE,
+                                      RIGHT_OPERAND_TYPE,
+                                      ADD_OPERAND_TYPE>;
+  using left_operand_type  = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using add_operand_type   = ADD_OPERAND_TYPE;
+  using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+
+  using element_type = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type   = typename LEFT_OPERAND_TYPE::index_type;
+
+  using result_type                       = typename multiply_op::result_type;
+  static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+  add_operand_type m_add_operand;
+
+public:
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorMultiplyAdd(left_operand_type const& left_operand,
+                    right_operand_type const& right_operand,
+                    add_operand_type const& add_operand)
+      : m_left_operand {left_operand},
+        m_right_operand {right_operand},
+        m_add_operand {add_operand}
+  {}
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const& tile) const
+      -> decltype(multiply_op::multiply_add(tile,
+                                            m_left_operand,
+                                            m_right_operand,
+                                            m_add_operand))
+  {
+    return multiply_op::multiply_add(tile, m_left_operand, m_right_operand,
+                                     m_add_operand);
+  }
 
-  } // namespace ET
 
-  } // namespace internal
-} // namespace expt
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("MultiplyAdd[");
+    multiply_op::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(", ");
+    m_add_operand.print_ast();
+    printf(")");
+  }
+};
+
+
+}  // namespace ET
+
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
index d5211e4963..f0512665cf 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
@@ -33,61 +33,58 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template <typename ET_TYPE>
+class TensorNegate : public TensorExpressionBase<TensorNegate<ET_TYPE>>
+{
+public:
+  using self_type    = TensorNegate<ET_TYPE>;
+  using rhs_type     = ET_TYPE;
+  using tensor_type  = typename ET_TYPE::result_type;
+  using element_type = typename tensor_type::element_type;
+  using index_type   = typename ET_TYPE::index_type;
+
+  using result_type                       = tensor_type;
+  using tile_type                         = typename ET_TYPE::tile_type;
+  static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorNegate(rhs_type const& tensor) : m_tensor {tensor} {}
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return m_tensor.getDimSize(dim);
+  }
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const& tile) const
   {
+    return m_tensor.eval(tile).scale(-1);
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Negate(");
+    m_tensor.print_ast();
+    printf(")");
+  }
+
+private:
+  rhs_type m_tensor;
+};
+
+
+}  // namespace ET
 
-    template<typename ET_TYPE>
-    class TensorNegate :  public TensorExpressionBase<TensorNegate<ET_TYPE>> {
-      public:
-        using self_type = TensorNegate<ET_TYPE>;
-        using rhs_type = ET_TYPE;
-        using tensor_type = typename ET_TYPE::result_type;
-        using element_type = typename tensor_type::element_type;
-        using index_type = typename ET_TYPE::index_type;
-
-        using result_type = tensor_type;
-        using tile_type = typename ET_TYPE::tile_type;
-        static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorNegate(rhs_type const &tensor) :
-        m_tensor{tensor}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tensor.getDimSize(dim);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const
-        {
-          return m_tensor.eval(tile).scale(-1);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Negate(");
-          m_tensor.print_ast();
-          printf(")");
-        }
-
-      private:
-        rhs_type m_tensor;
-    };
-
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
index 4ab0a3ebc6..ac692c3bcf 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
@@ -33,78 +33,71 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template <typename T>
+class TensorScalarLiteral : public TensorExpressionBase<TensorScalarLiteral<T>>
+{
+public:
+  using self_type    = TensorScalarLiteral<T>;
+  using tensor_type  = RAJA::expt::ScalarRegister<T>;
+  using element_type = T;
+  using result_type  = T;
+  using index_type   = RAJA::Index_type;
+
+  static constexpr camp::idx_t s_num_dims = 0;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type) const { return 0; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  explicit constexpr TensorScalarLiteral(element_type const& value) noexcept
+      : m_value {value}
+  {}
+
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE element_type eval(TILE_TYPE const&) const
   {
+    return m_value;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("ScalarLiteral(%e)", (double)m_value); }
+
+private:
+  element_type m_value;
+};
 
 
-    template<typename T>
-    class TensorScalarLiteral :  public TensorExpressionBase<TensorScalarLiteral<T>> {
-      public:
-        using self_type = TensorScalarLiteral<T>;
-        using tensor_type = RAJA::expt::ScalarRegister<T>;
-        using element_type = T;
-        using result_type = T;
-        using index_type = RAJA::Index_type;
-
-        static constexpr camp::idx_t s_num_dims = 0;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type ) const {
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        constexpr
-        TensorScalarLiteral(element_type const &value) noexcept :
-        m_value{value}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        element_type eval(TILE_TYPE const &) const {
-          return m_value;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("ScalarLiteral(%e)", (double)m_value);
-        }
-
-      private:
-        element_type m_value;
-    };
-
-
-    /*
-     * For arithmetic values, we need to wrap in a constant value ET node
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
+/*
+ * For arithmetic values, we need to wrap in a constant value ET node
+ */
+template <typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
     typename std::enable_if<std::is_arithmetic<RHS>::value>::type>
-    {
-        using return_type = TensorScalarLiteral<RHS>;
+{
+  using return_type = TensorScalarLiteral<RHS>;
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return return_type(rhs);
-        }
-    };
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const& rhs)
+  {
+    return return_type(rhs);
+  }
+};
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
index 46950eec6f..a1e9fa4542 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
@@ -33,67 +33,63 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template <typename ET_TYPE>
+class TensorTranspose : public TensorExpressionBase<TensorTranspose<ET_TYPE>>
+{
+public:
+  using self_type    = TensorTranspose<ET_TYPE>;
+  using rhs_type     = ET_TYPE;
+  using tensor_type  = typename ET_TYPE::result_type;
+  using element_type = typename tensor_type::element_type;
+  using index_type   = typename ET_TYPE::index_type;
+
+  using result_type                       = tensor_type;
+  using tile_type                         = typename ET_TYPE::tile_type;
+  static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorTranspose(rhs_type const& tensor) : m_tensor {tensor} {}
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
   {
+    return m_tensor.getDimSize(dim);
+  }
+
+  template <typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const& tile) const
+  {
+    // transpose which tile we are returning
+    TILE_TYPE trans_tile {{tile.m_begin[1], tile.m_begin[0]},
+                          {tile.m_size[1], tile.m_size[0]}};
+
+    // evaluate and return the transposed tile
+    return m_tensor.eval(trans_tile).transpose();
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Transpose(");
+    m_tensor.print_ast();
+    printf(")");
+  }
+
+private:
+  rhs_type m_tensor;
+};
+
+
+}  // namespace ET
 
-    template<typename ET_TYPE>
-    class TensorTranspose :  public TensorExpressionBase<TensorTranspose<ET_TYPE>> {
-      public:
-        using self_type = TensorTranspose<ET_TYPE>;
-        using rhs_type = ET_TYPE;
-        using tensor_type = typename ET_TYPE::result_type;
-        using element_type = typename tensor_type::element_type;
-        using index_type = typename ET_TYPE::index_type;
-
-        using result_type = tensor_type;
-        using tile_type = typename ET_TYPE::tile_type;
-        static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorTranspose(rhs_type const &tensor) :
-        m_tensor{tensor}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tensor.getDimSize(dim);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const {
-          // transpose which tile we are returning
-          TILE_TYPE trans_tile{
-            {tile.m_begin[1], tile.m_begin[0]},
-            {tile.m_size[1],  tile.m_size[0]}
-          };
-
-          // evaluate and return the transposed tile
-          return m_tensor.eval(trans_tile).transpose();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Transpose(");
-          m_tensor.print_ast();
-          printf(")");
-        }
-
-      private:
-        rhs_type m_tensor;
-    };
-
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp b/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
index 2a868a3131..7f3059acdf 100644
--- a/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
@@ -33,64 +33,57 @@ namespace expt
 {
 
 
-    class TensorRegisterConcreteBase;
-
-  namespace ET
-  {
-    class TensorExpressionConcreteBase;
-
-    template<typename RHS, typename enable = void>
-    struct NormalizeOperandHelper;
-
-
-    /*
-     * For TensorExpression nodes, we just return them as-is.
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
-    typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value>::type>
-    {
-        using return_type = RHS;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return rhs;
-        }
-    };
-
-
-
-
-    /**
-     * Allows uniform packaging up of operands into ExpressionTemplates.
-     *
-     * The NormalizeOperandHelper is specialized throughout the code in order
-     * to convert non-ET operands into ET objects
-     *
-     * ET operators can then take any operand type, and use this to convert
-     * them into ET types the same way.
-     */
-    template<typename RHS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto normalizeOperand(RHS const &rhs) ->
+class TensorRegisterConcreteBase;
+
+namespace ET
+{
+class TensorExpressionConcreteBase;
+
+template <typename RHS, typename enable = void>
+struct NormalizeOperandHelper;
+
+
+/*
+ * For TensorExpression nodes, we just return them as-is.
+ */
+template <typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value>::type>
+{
+  using return_type = RHS;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const& rhs) { return rhs; }
+};
+
+
+/**
+ * Allows uniform packaging up of operands into ExpressionTemplates.
+ *
+ * The NormalizeOperandHelper is specialized throughout the code in order
+ * to convert non-ET operands into ET objects
+ *
+ * ET operators can then take any operand type, and use this to convert
+ * them into ET types the same way.
+ */
+template <typename RHS>
+RAJA_INLINE RAJA_HOST_DEVICE auto normalizeOperand(RHS const& rhs) ->
     typename NormalizeOperandHelper<RHS>::return_type
-    {
-      return NormalizeOperandHelper<RHS>::normalize(rhs);
-    }
+{
+  return NormalizeOperandHelper<RHS>::normalize(rhs);
+}
 
-    template<typename RHS>
-    using normalize_operand_t =
-        typename NormalizeOperandHelper<RHS>::return_type;
+template <typename RHS>
+using normalize_operand_t = typename NormalizeOperandHelper<RHS>::return_type;
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp b/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
index 2b6bf7304d..a94ec924db 100644
--- a/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
+++ b/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
@@ -31,5 +31,4 @@
 #include "RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp"
 
 
-
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp b/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
index 08a9886acc..bb5ef862cc 100644
--- a/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
+++ b/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
@@ -31,308 +31,325 @@ namespace expt
 {
 
 
+template <typename MATA, typename MATB>
+struct MatrixMatrixMultiplyHelper;
 
 
+/**
+ *
+ * Row-Major * Row-Major ==> Row-Major
+ *
+ */
+template <typename T,
+          typename REGISTER_POLICY,
+          camp::idx_t N_SIZE,
+          camp::idx_t M_SIZE,
+          camp::idx_t M2_SIZE,
+          camp::idx_t O_SIZE>
+struct MatrixMatrixMultiplyHelper<
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::RowMajorLayout,
+                               camp::idx_seq<N_SIZE, M_SIZE>>,
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::RowMajorLayout,
+                               camp::idx_seq<M2_SIZE, O_SIZE>>>
+{
 
-
-
-  template<typename MATA, typename MATB>
-  struct MatrixMatrixMultiplyHelper;
-
-
-
-  /**
-   *
-   * Row-Major * Row-Major ==> Row-Major
+  static_assert(M_SIZE == M2_SIZE,
+                "Matrices are not compatible for multiplication");
+
+  using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                               T,
+                                               RAJA::expt::RowMajorLayout,
+                                               camp::idx_seq<N_SIZE, M_SIZE>>;
+
+  using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                T,
+                                                RAJA::expt::RowMajorLayout,
+                                                camp::idx_seq<M_SIZE, O_SIZE>>;
+
+  using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                 T,
+                                                 RAJA::expt::RowMajorLayout,
+                                                 camp::idx_seq<N_SIZE, O_SIZE>>;
+
+  using register_type = typename result_type::register_type;
+
+  static constexpr camp::idx_t s_elements_per_register =
+      left_type::s_elements_per_register;
+  static constexpr camp::idx_t s_A_minor_dim_registers =
+      left_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_B_minor_dim_registers =
+      right_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_C_minor_dim_registers =
+      result_type::s_minor_dim_registers;
+
+  /*
+   * Matrix B (and C) has 1 or more registers per row
    *
    */
-  template<typename T, typename REGISTER_POLICY, camp::idx_t N_SIZE, camp::idx_t M_SIZE, camp::idx_t M2_SIZE, camp::idx_t O_SIZE>
-  struct MatrixMatrixMultiplyHelper<
-  RAJA::expt::TensorRegister<REGISTER_POLICY,
-                   T,
-                   RAJA::expt::RowMajorLayout,
-                   camp::idx_seq<N_SIZE, M_SIZE>>,
-                   RAJA::expt::TensorRegister<REGISTER_POLICY,
-                    T,
-                    RAJA::expt::RowMajorLayout,
-                    camp::idx_seq<M2_SIZE, O_SIZE>> >
-    {
-
-      static_assert(M_SIZE == M2_SIZE, "Matrices are not compatible for multiplication");
-
-      using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                       T,
-                                       RAJA::expt::RowMajorLayout,
-                                       camp::idx_seq<N_SIZE, M_SIZE>>;
-
-      using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                        T,
-                                        RAJA::expt::RowMajorLayout,
-                                        camp::idx_seq<M_SIZE, O_SIZE>> ;
-
-      using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                         T,
-                                         RAJA::expt::RowMajorLayout,
-                                         camp::idx_seq<N_SIZE, O_SIZE>> ;
-
-      using register_type = typename result_type::register_type;
-
-      static constexpr camp::idx_t s_elements_per_register = left_type::s_elements_per_register;
-      static constexpr camp::idx_t s_A_minor_dim_registers = left_type::s_minor_dim_registers;
-      static constexpr camp::idx_t s_B_minor_dim_registers = right_type::s_minor_dim_registers;
-      static constexpr camp::idx_t s_C_minor_dim_registers = result_type::s_minor_dim_registers;
-
-      /*
-       * Matrix B (and C) has 1 more more registers per row
-       *
-       */
-      template<typename dummy = void>
-      RAJA_HOST_DEVICE
-      static
-      RAJA_INLINE
+  template <typename dummy = void>
+  RAJA_HOST_DEVICE static RAJA_INLINE
       typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
-      multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-      {
+      multiply_accumulate(left_type const& A,
+                          right_type const& B,
+                          result_type& C)
+  {
 #if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
-        RAJA::tensor_stats::num_matrix_mm_multacc_row_row ++;
+    RAJA::tensor_stats::num_matrix_mm_multacc_row_row++;
 #endif
 
-        constexpr camp::idx_t num_bc_reg_per_row = s_C_minor_dim_registers;
+    constexpr camp::idx_t num_bc_reg_per_row = s_C_minor_dim_registers;
 
-        RAJA_UNROLL
-        for(camp::idx_t c_reg = 0;c_reg < result_type::s_num_registers;++ c_reg){
-          camp::idx_t bc_col_reg = c_reg % num_bc_reg_per_row;
-          camp::idx_t ac_row = c_reg / num_bc_reg_per_row;
-
-          RAJA_UNROLL
-          for(camp::idx_t a_col = 0;a_col < M_SIZE;++ a_col){
-            camp::idx_t b_reg = a_col * num_bc_reg_per_row + bc_col_reg;
-
-            C.get_register(c_reg) =
-                register_type(A.get(ac_row, a_col)).multiply_add(
-                    B.get_register(b_reg),
-                    C.get_register(c_reg));
-          }
-        }
-
-      }
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0; c_reg < result_type::s_num_registers; ++c_reg)
+    {
+      camp::idx_t bc_col_reg = c_reg % num_bc_reg_per_row;
+      camp::idx_t ac_row     = c_reg / num_bc_reg_per_row;
 
-      /*
-       * Matrix B (and C) have less than one register per row
-       *
-       */
-      template<typename dummy = void>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
-      multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
+      RAJA_UNROLL
+      for (camp::idx_t a_col = 0; a_col < M_SIZE; ++a_col)
       {
-        constexpr camp::idx_t bc_segbits = result_type::s_segbits;
-        constexpr camp::idx_t a_segments_per_register = 1<<bc_segbits;
-
-        RAJA_UNROLL
-        for(camp::idx_t ac_row = 0;ac_row < N_SIZE;++ ac_row){
-          camp::idx_t c_reg     = ac_row / result_type::s_major_dim_per_register;
-          camp::idx_t c_segment = ac_row % result_type::s_major_dim_per_register;
-          register_type c_tmp;
-
-          RAJA_UNROLL
-          for(camp::idx_t b_reg = 0;b_reg < right_type::s_num_registers;++ b_reg){
-
-            camp::idx_t a_segment = ac_row*right_type::s_num_registers + b_reg;
-            camp::idx_t a_reg = a_segment / a_segments_per_register;
-            camp::idx_t a_reg_segment = a_segment % a_segments_per_register;
-
-            auto a_tmp = A.get_register(a_reg).segmented_broadcast_outer(bc_segbits, a_reg_segment);
-
-            if(b_reg == 0){
-
-              c_tmp = a_tmp.multiply(B.get_register(b_reg));
-            }
-            else{
-              c_tmp = a_tmp.multiply_add(B.get_register(b_reg), c_tmp);
-            }
-
-          }
-
-          C.get_register(c_reg) += c_tmp.segmented_sum_outer(bc_segbits, c_segment);
-
-        }
+        camp::idx_t b_reg = a_col * num_bc_reg_per_row + bc_col_reg;
 
+        C.get_register(c_reg) =
+            register_type(A.get(ac_row, a_col))
+                .multiply_add(B.get_register(b_reg), C.get_register(c_reg));
       }
+    }
+  }
 
-      RAJA_HOST_DEVICE
-      static
-      RAJA_INLINE
-      void multiply(left_type const &A, right_type const &B, result_type &C){
-        C = result_type(0);
-        multiply_accumulate(A, B, C);
-      }
-  };
-
-
-  /**
-   *
-   * Column-Major * Column-Major ==> Column-Major
+  /*
+   * Matrix B (and C) have less than one register per row
    *
    */
-  template<typename T, typename REGISTER_POLICY, camp::idx_t N_SIZE, camp::idx_t M_SIZE, camp::idx_t M2_SIZE, camp::idx_t O_SIZE>
-    struct MatrixMatrixMultiplyHelper<
-    RAJA::expt::TensorRegister<REGISTER_POLICY,
-                     T,
-                     RAJA::expt::ColMajorLayout,
-                     camp::idx_seq<N_SIZE, M_SIZE>>,
-                     RAJA::expt::TensorRegister<REGISTER_POLICY,
-                      T,
-                      RAJA::expt::ColMajorLayout,
-                      camp::idx_seq<M2_SIZE, O_SIZE>> >
-      {
-
-      using self_type = MatrixMatrixMultiplyHelper<
-          RAJA::expt::TensorRegister<REGISTER_POLICY,
-                         T,
-                         RAJA::expt::ColMajorLayout,
-                         camp::idx_seq<N_SIZE, M_SIZE>>,
-                         RAJA::expt::TensorRegister<REGISTER_POLICY,
-                          T,
-                          RAJA::expt::ColMajorLayout,
-                          camp::idx_seq<M2_SIZE, O_SIZE>> >;
-
-        static_assert(M_SIZE == M2_SIZE, "Matrices are not compatible for multiplication");
-
-        using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                         T,
-                                         RAJA::expt::ColMajorLayout,
-                                         camp::idx_seq<N_SIZE, M_SIZE>>;
-
-        using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                          T,
-                                          RAJA::expt::ColMajorLayout,
-                                          camp::idx_seq<M_SIZE, O_SIZE>> ;
-
-        using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                           T,
-                                           RAJA::expt::ColMajorLayout,
-                                           camp::idx_seq<N_SIZE, O_SIZE>> ;
-
-        using register_type = typename result_type::register_type;
-
-        static constexpr camp::idx_t s_elements_per_register = left_type::s_elements_per_register;
-        static constexpr camp::idx_t s_A_minor_dim_registers = left_type::s_minor_dim_registers;
-        static constexpr camp::idx_t s_B_minor_dim_registers = right_type::s_minor_dim_registers;
-        static constexpr camp::idx_t s_C_minor_dim_registers = result_type::s_minor_dim_registers;
-
-
-
-        /*
-         * Matrix A (and C) has 1 more more registers per column
-         *
-         */
-        template<typename dummy = void>
-        RAJA_HOST_DEVICE
-        static
-        RAJA_INLINE
-        typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
-        multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-        {
-
-  #if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
-          RAJA::tensor_stats::num_matrix_mm_multacc_row_row ++;
-  #endif
+  template <typename dummy = void>
+  RAJA_HOST_DEVICE RAJA_INLINE static
+      typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
+      multiply_accumulate(left_type const& A,
+                          right_type const& B,
+                          result_type& C)
+  {
+    constexpr camp::idx_t bc_segbits              = result_type::s_segbits;
+    constexpr camp::idx_t a_segments_per_register = 1 << bc_segbits;
+
+    RAJA_UNROLL
+    for (camp::idx_t ac_row = 0; ac_row < N_SIZE; ++ac_row)
+    {
+      camp::idx_t c_reg     = ac_row / result_type::s_major_dim_per_register;
+      camp::idx_t c_segment = ac_row % result_type::s_major_dim_per_register;
+      register_type c_tmp;
 
+      RAJA_UNROLL
+      for (camp::idx_t b_reg = 0; b_reg < right_type::s_num_registers; ++b_reg)
+      {
 
-          constexpr camp::idx_t num_ac_reg_per_col = s_C_minor_dim_registers;
+        camp::idx_t a_segment = ac_row * right_type::s_num_registers + b_reg;
+        camp::idx_t a_reg     = a_segment / a_segments_per_register;
+        camp::idx_t a_reg_segment = a_segment % a_segments_per_register;
 
-          RAJA_UNROLL
-          for(camp::idx_t c_reg = 0;c_reg < result_type::s_num_registers;++ c_reg){
-            camp::idx_t ac_row_reg = c_reg % num_ac_reg_per_col;
-            camp::idx_t bc_col = c_reg / num_ac_reg_per_col;
+        auto a_tmp = A.get_register(a_reg).segmented_broadcast_outer(
+            bc_segbits, a_reg_segment);
 
-            RAJA_UNROLL
-            for(camp::idx_t b_row = 0;b_row < M_SIZE;++ b_row){
-              camp::idx_t a_reg = b_row * num_ac_reg_per_col + ac_row_reg;
+        if (b_reg == 0)
+        {
 
-              C.get_register(c_reg) =
-                  register_type(B.get(b_row, bc_col)).multiply_add(
-                      A.get_register(a_reg),
-                      C.get_register(c_reg));
-            }
-          }
+          c_tmp = a_tmp.multiply(B.get_register(b_reg));
+        }
+        else
+        {
+          c_tmp = a_tmp.multiply_add(B.get_register(b_reg), c_tmp);
+        }
+      }
 
+      C.get_register(c_reg) += c_tmp.segmented_sum_outer(bc_segbits, c_segment);
+    }
+  }
 
-        }
+  RAJA_HOST_DEVICE
+  static RAJA_INLINE void
+  multiply(left_type const& A, right_type const& B, result_type& C)
+  {
+    C = result_type(0);
+    multiply_accumulate(A, B, C);
+  }
+};
 
-        /*
-         * Matrix A (and C) have less than one register per column
-         *
-         */
-        template<typename dummy = void>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static
-        typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
-        multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-        {
-          constexpr camp::idx_t ac_segbits = result_type::s_segbits;
-          constexpr camp::idx_t b_segments_per_register = 1<<ac_segbits;
 
-          camp::idx_t bc_col = 0;
+/**
+ *
+ * Column-Major * Column-Major ==> Column-Major
+ *
+ */
+template <typename T,
+          typename REGISTER_POLICY,
+          camp::idx_t N_SIZE,
+          camp::idx_t M_SIZE,
+          camp::idx_t M2_SIZE,
+          camp::idx_t O_SIZE>
+struct MatrixMatrixMultiplyHelper<
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::ColMajorLayout,
+                               camp::idx_seq<N_SIZE, M_SIZE>>,
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::ColMajorLayout,
+                               camp::idx_seq<M2_SIZE, O_SIZE>>>
+{
 
-          RAJA_UNROLL
-          for(camp::idx_t c_reg = 0;c_reg < N_SIZE/result_type::s_major_dim_per_register;++ c_reg){
+  using self_type = MatrixMatrixMultiplyHelper<
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::ColMajorLayout,
+                                 camp::idx_seq<N_SIZE, M_SIZE>>,
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::ColMajorLayout,
+                                 camp::idx_seq<M2_SIZE, O_SIZE>>>;
+
+  static_assert(M_SIZE == M2_SIZE,
+                "Matrices are not compatible for multiplication");
+
+  using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                               T,
+                                               RAJA::expt::ColMajorLayout,
+                                               camp::idx_seq<N_SIZE, M_SIZE>>;
+
+  using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                T,
+                                                RAJA::expt::ColMajorLayout,
+                                                camp::idx_seq<M_SIZE, O_SIZE>>;
+
+  using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                 T,
+                                                 RAJA::expt::ColMajorLayout,
+                                                 camp::idx_seq<N_SIZE, O_SIZE>>;
+
+  using register_type = typename result_type::register_type;
+
+  static constexpr camp::idx_t s_elements_per_register =
+      left_type::s_elements_per_register;
+  static constexpr camp::idx_t s_A_minor_dim_registers =
+      left_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_B_minor_dim_registers =
+      right_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_C_minor_dim_registers =
+      result_type::s_minor_dim_registers;
+
+
+  /*
+   * Matrix A (and C) has 1 or more registers per column
+   *
+   */
+  template <typename dummy = void>
+  RAJA_HOST_DEVICE static RAJA_INLINE
+      typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
+      multiply_accumulate(left_type const& A,
+                          right_type const& B,
+                          result_type& C)
+  {
 
-            RAJA_UNROLL
-            for(camp::idx_t c_segment = 0;c_segment < result_type::s_major_dim_per_register;++ c_segment){
+#if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
+    RAJA::tensor_stats::num_matrix_mm_multacc_row_row++;
+#endif
 
-              register_type c_tmp;
 
-              RAJA_UNROLL
-              for(camp::idx_t a_reg = 0;a_reg < right_type::s_num_registers;++ a_reg){
+    constexpr camp::idx_t num_ac_reg_per_col = s_C_minor_dim_registers;
 
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0; c_reg < result_type::s_num_registers; ++c_reg)
+    {
+      camp::idx_t ac_row_reg = c_reg % num_ac_reg_per_col;
+      camp::idx_t bc_col     = c_reg / num_ac_reg_per_col;
 
-                camp::idx_t b_segment = bc_col*right_type::s_num_registers + a_reg;
-                camp::idx_t b_reg = b_segment / b_segments_per_register;
-                camp::idx_t b_reg_segment = b_segment % b_segments_per_register;
+      RAJA_UNROLL
+      for (camp::idx_t b_row = 0; b_row < M_SIZE; ++b_row)
+      {
+        camp::idx_t a_reg = b_row * num_ac_reg_per_col + ac_row_reg;
 
-                register_type b_tmp = B.get_register(b_reg).segmented_broadcast_outer(ac_segbits, b_reg_segment);
+        C.get_register(c_reg) =
+            register_type(B.get(b_row, bc_col))
+                .multiply_add(A.get_register(a_reg), C.get_register(c_reg));
+      }
+    }
+  }
 
-                if(a_reg == 0){
-                  c_tmp = b_tmp.multiply(A.get_register(a_reg));
-                }
-                else{
-                  c_tmp = b_tmp.multiply_add(A.get_register(a_reg), c_tmp);
-                }
+  /*
+   * Matrix A (and C) have less than one register per column
+   *
+   */
+  template <typename dummy = void>
+  RAJA_HOST_DEVICE RAJA_INLINE static
+      typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
+      multiply_accumulate(left_type const& A,
+                          right_type const& B,
+                          result_type& C)
+  {
+    constexpr camp::idx_t ac_segbits              = result_type::s_segbits;
+    constexpr camp::idx_t b_segments_per_register = 1 << ac_segbits;
+
+    camp::idx_t bc_col = 0;
+
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0;
+         c_reg < N_SIZE / result_type::s_major_dim_per_register; ++c_reg)
+    {
 
-              }
+      RAJA_UNROLL
+      for (camp::idx_t c_segment = 0;
+           c_segment < result_type::s_major_dim_per_register; ++c_segment)
+      {
 
-              C.get_register(c_reg) += c_tmp.segmented_sum_outer(ac_segbits, c_segment);
+        register_type c_tmp;
 
-              ++ bc_col;
-            } // c_segment
-          } // c_reg
+        RAJA_UNROLL
+        for (camp::idx_t a_reg = 0; a_reg < right_type::s_num_registers;
+             ++a_reg)
+        {
 
 
-        }
+          camp::idx_t b_segment = bc_col * right_type::s_num_registers + a_reg;
+          camp::idx_t b_reg     = b_segment / b_segments_per_register;
+          camp::idx_t b_reg_segment = b_segment % b_segments_per_register;
 
+          register_type b_tmp = B.get_register(b_reg).segmented_broadcast_outer(
+              ac_segbits, b_reg_segment);
 
-        RAJA_HOST_DEVICE
-        static
-        RAJA_INLINE
-        void multiply(left_type const &A, right_type const &B, result_type &C){
-          C = result_type(0);
-          self_type::multiply_accumulate(A, B, C);
+          if (a_reg == 0)
+          {
+            c_tmp = b_tmp.multiply(A.get_register(a_reg));
+          }
+          else
+          {
+            c_tmp = b_tmp.multiply_add(A.get_register(a_reg), c_tmp);
+          }
         }
-    };
 
+        C.get_register(c_reg) +=
+            c_tmp.segmented_sum_outer(ac_segbits, c_segment);
 
+        ++bc_col;
+      }  // c_segment
+    }    // c_reg
+  }
 
 
-} // namespace expt
-} // namespace internal
-} // namespace RAJA
+  RAJA_HOST_DEVICE
+  static RAJA_INLINE void
+  multiply(left_type const& A, right_type const& B, result_type& C)
+  {
+    C = result_type(0);
+    self_type::multiply_accumulate(A, B, C);
+  }
+};
 
 
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp b/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
index 3036a096b5..3134421735 100644
--- a/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
+++ b/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
@@ -24,7 +24,7 @@
 #include "RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp"
 #include "RAJA/util/BitMask.hpp"
 
-//#define DEBUG_MATRIX_LOAD_STORE
+// #define DEBUG_MATRIX_LOAD_STORE
 
 
 namespace RAJA
@@ -32,1121 +32,1342 @@ namespace RAJA
 namespace expt
 {
 
-  /*
-   * 2D (Matrix) specialization of TensorRegister
-   */
-  template<typename REGISTER_POLICY, typename T, camp::idx_t ROW_ORD, camp::idx_t COL_ORD, camp::idx_t ROW_SIZE, camp::idx_t COL_SIZE>
-  class TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>> :
-    public RAJA::internal::expt::TensorRegisterBase<TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>>
+/*
+ * 2D (Matrix) specialization of TensorRegister
+ */
+template <typename REGISTER_POLICY,
+          typename T,
+          camp::idx_t ROW_ORD,
+          camp::idx_t COL_ORD,
+          camp::idx_t ROW_SIZE,
+          camp::idx_t COL_SIZE>
+class TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<ROW_ORD, COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>
+    : public RAJA::internal::expt::TensorRegisterBase<
+          TensorRegister<REGISTER_POLICY,
+                         T,
+                         TensorLayout<ROW_ORD, COL_ORD>,
+                         camp::idx_seq<ROW_SIZE, COL_SIZE>>>
+{
+public:
+  using self_type = TensorRegister<REGISTER_POLICY,
+                                   T,
+                                   TensorLayout<ROW_ORD, COL_ORD>,
+                                   camp::idx_seq<ROW_SIZE, COL_SIZE>>;
+  using base_type = RAJA::internal::expt::TensorRegisterBase<
+      TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<ROW_ORD, COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>>;
+  using register_type      = Register<T, REGISTER_POLICY>;
+  using row_vector_type    = VectorRegister<T, REGISTER_POLICY, COL_SIZE>;
+  using column_vector_type = VectorRegister<T, REGISTER_POLICY, ROW_SIZE>;
+  using register_policy    = REGISTER_POLICY;
+  using element_type       = T;
+  using layout_type        = TensorLayout<ROW_ORD, COL_ORD>;
+
+  using transpose_tensor_type =
+      TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<!ROW_ORD, !COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>;
+
+  using transpose_type = TensorRegister<REGISTER_POLICY,
+                                        T,
+                                        layout_type,
+                                        camp::idx_seq<COL_SIZE, ROW_SIZE>>;
+  using product_type   = TensorRegister<REGISTER_POLICY,
+                                      T,
+                                      layout_type,
+                                      camp::idx_seq<ROW_SIZE, ROW_SIZE>>;
+
+  static constexpr camp::idx_t s_num_rows    = ROW_SIZE;
+  static constexpr camp::idx_t s_num_columns = COL_SIZE;
+
+
+  static constexpr camp::idx_t s_elements_per_register =
+      RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem;
+
+  // number of registers to hold entire matrix
+  static constexpr camp::idx_t s_num_registers =
+      (ROW_SIZE * COL_SIZE) / s_elements_per_register;
+
+  // We only allow matrix sizes that exactly fit in some number of registers
+  static_assert((ROW_SIZE * COL_SIZE) ==
+                    s_num_registers * s_elements_per_register,
+                "MatrixRegister must be dimensioned to exactly fit an integer "
+                "number of registers");
+
+  using log_base2_t = RAJA::LogBase2<s_elements_per_register>;
+
+  static constexpr camp::idx_t s_shift_per_register = log_base2_t::value;
+
+  static constexpr camp::idx_t s_mask_per_register =
+      (1 << log_base2_t::value) - 1;
+
+
+  static constexpr camp::idx_t s_minor_dim_elements =
+      layout_type::is_row_major() ? s_num_columns : s_num_rows;
+
+  static constexpr camp::idx_t s_major_dim_elements =
+      layout_type::is_row_major() ? s_num_rows : s_num_columns;
+
+  // number of (full) registers that span the minor dim
+  // if a single register is split across multiple rows or columns, then
+  // this is 0
+  static constexpr camp::idx_t s_minor_dim_registers =
+      s_minor_dim_elements / s_elements_per_register;
+
+  static_assert(s_minor_dim_registers > 0 || log_base2_t::is_exact,
+                "Minor dimension smaller than a vector need to be a power of "
+                "two fraction");
+
+  static_assert(s_minor_dim_registers == 0 ||
+                    (s_minor_dim_elements % s_elements_per_register == 0),
+                "Minor dimensions greater than a vector length must be an "
+                "integer number of vectors");
+
+
+  static constexpr camp::idx_t s_major_dim_per_register =
+      s_elements_per_register / s_minor_dim_elements;
+
+  static constexpr camp::idx_t s_segbits =
+      RAJA::LogBase2<s_minor_dim_elements>::value;
+
+private:
+  template <typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_register(IDX row,
+                                                                 IDX col) -> IDX
   {
-    public:
-      using self_type = TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>;
-      using base_type = RAJA::internal::expt::TensorRegisterBase<TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>>;
-      using register_type = Register<T, REGISTER_POLICY>;
-      using row_vector_type = VectorRegister<T, REGISTER_POLICY, COL_SIZE>;
-      using column_vector_type = VectorRegister<T, REGISTER_POLICY, ROW_SIZE>;
-      using register_policy = REGISTER_POLICY;
-      using element_type = T;
-      using layout_type = TensorLayout<ROW_ORD, COL_ORD>;
-
-      using transpose_tensor_type = TensorRegister<REGISTER_POLICY, T, TensorLayout<!ROW_ORD, !COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>;
-
-      using transpose_type = TensorRegister<REGISTER_POLICY, T, layout_type, camp::idx_seq<COL_SIZE, ROW_SIZE>>;
-      using product_type = TensorRegister<REGISTER_POLICY, T, layout_type, camp::idx_seq<ROW_SIZE, ROW_SIZE>>;
-
-      static constexpr camp::idx_t s_num_rows = ROW_SIZE;
-      static constexpr camp::idx_t s_num_columns = COL_SIZE;
-
-
+    return layout_type::is_row_major()
+               ? (row * IDX(COL_SIZE) + col) >> IDX(s_shift_per_register)
+               : (col * IDX(ROW_SIZE) + row) >> IDX(s_shift_per_register);
+  }
+
+  template <typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_lane(IDX row, IDX col)
+      -> IDX
+  {
+    return layout_type::is_row_major()
+               ? (row * IDX(COL_SIZE) + col) & IDX(s_mask_per_register)
+               : (col * IDX(ROW_SIZE) + row) & IDX(s_mask_per_register);
+  }
 
+  using base_type::m_registers;
 
-      static constexpr camp::idx_t s_elements_per_register =
-          RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem;
+public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr TensorRegister() : base_type() {}
 
-      // number of registers to hold entire matrix
-      static constexpr camp::idx_t s_num_registers =
-          (ROW_SIZE*COL_SIZE) / s_elements_per_register;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  TensorRegister(element_type c) : base_type(c) { this->broadcast(c); }
 
-      // We only allow matrix sizes that exactly fit in some number of registers
-      static_assert((ROW_SIZE*COL_SIZE) == s_num_registers*s_elements_per_register,
-          "MatrixRegister must be dimensioned to exactly fit an integer number of registers");
 
-      using log_base2_t = RAJA::LogBase2<s_elements_per_register>;
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorRegister(self_type const& c) : base_type(c) { this->copy(c); }
 
-      static constexpr camp::idx_t s_shift_per_register =
-          log_base2_t::value;
 
-      static constexpr camp::idx_t s_mask_per_register =
-          (1<<log_base2_t::value)-1;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  ~TensorRegister() {}
 
 
-      static constexpr camp::idx_t s_minor_dim_elements =
-          layout_type::is_row_major() ? s_num_columns : s_num_rows;
+  /*!
+   * Returns true if the underlying data is packed for a given tensor ref
+   *
+   * This is true if either:
+   *   It's column major and the rows are stride one
+   *   It's row major and the columns are stride one
+   */
+  template <camp::idx_t STRIDE_ONE_DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr bool is_ref_packed()
+  {
+    return (STRIDE_ONE_DIM == 0 && layout_type::is_column_major()) ||
+           (STRIDE_ONE_DIM == 1 && layout_type::is_row_major());
+  }
 
-      static constexpr camp::idx_t s_major_dim_elements =
-          layout_type::is_row_major() ? s_num_rows : s_num_columns;
+  /*!
+   * Gets the maximum size of matrix along specified dimension
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr camp::idx_t s_dim_elem(camp::idx_t dim)
+  {
+    return dim == 0 ? ROW_SIZE : COL_SIZE;
+  }
 
-      // number of (full) registers that span the minor dim
-      // if a single register is split across multiple rows or columns, then
-      // this is 0
-      static constexpr camp::idx_t s_minor_dim_registers =
-              s_minor_dim_elements / s_elements_per_register;
 
-      static_assert(s_minor_dim_registers >0  ||  log_base2_t::is_exact,
-          "Minor dimension smaller than a vector need to be a power of two fraction");
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator=(element_type value)
+  {
+    this->broadcast(value);
+    return *this;
+  }
 
-      static_assert(s_minor_dim_registers == 0 || (s_minor_dim_elements % s_elements_per_register == 0),
-          "Minor dimensions greater than a vector length must be an integer number of vectors");
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator=(self_type const& c) { return this->copy(c); }
 
 
-      static constexpr camp::idx_t s_major_dim_per_register =
-          s_elements_per_register / s_minor_dim_elements;
+  /*!
+   * Provide matrix-matrix multiply for operator* between two matrices
+   */
+  template <typename T2, typename L, typename RP>
+  self_type operator*(SquareMatrixRegister<T2, L, RP> const& y) const
+  {
+    return matrix_multiply(y);
+  }
 
-      static constexpr camp::idx_t s_segbits = RAJA::LogBase2<s_minor_dim_elements>::value;
+  /*!
+   * Provide right matrix-vector multiply for operator* between this
+   * matrix and a vector.
+   */
+  template <typename T2, typename RP>
+  VectorRegister<T2, RP> operator*(VectorRegister<T2, RP> const& y) const
+  {
+    return right_multiply_vector(y);
+  }
 
-    private:
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_register(IDX row, IDX col) -> IDX {
-        return layout_type::is_row_major() ?
-            (row*IDX(COL_SIZE) + col) >> IDX(s_shift_per_register) :
-            (col*IDX(ROW_SIZE) + row) >> IDX(s_shift_per_register);
-      }
+  template <typename REF_TYPE>
+  struct RefBridge;
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_lane(IDX row, IDX col) -> IDX {
-        return layout_type::is_row_major() ?
-            (row*IDX(COL_SIZE) + col) & IDX(s_mask_per_register) :
-            (col*IDX(ROW_SIZE) + row) & IDX(s_mask_per_register);
-      }
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& load_ref(REF_TYPE const& ref)
+  {
+    RefBridge<REF_TYPE>::load_ref(*this, ref);
+    return *this;
+  }
 
-      using base_type::m_registers;
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const& store_ref(REF_TYPE& ref) const
+  {
+    RefBridge<REF_TYPE>::store_ref(*this, ref);
+    return *this;
+  }
+
+
+  template <typename POINTER_TYPE,
+            typename INDEX_TYPE,
+            RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+            camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<
+      RAJA::internal::expt::
+          TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>>
+  {
 
-    public:
+    using RefType = RAJA::internal::expt::
+        TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>;
 
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void load_ref(self_type& self, RefType const& ref)
+    {
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegister() : base_type() {}
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(element_type c) : base_type(c)
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>())
       {
-        this->broadcast(c);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                              ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(self_type const &c) : base_type(c)
+      // strided data
+      else
       {
-        this->copy(c);
-      }
-
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~TensorRegister(){}
-
-
-      /*!
-       * Returns true if the underlying data packed for a given tensor ref
-       *
-       * This is true if either:
-       *   It's column major and the rows are stride one
-       *   It's row major and the columns are stride one
-       */
-      template<camp::idx_t STRIDE_ONE_DIM>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_ref_packed() {
-        return (STRIDE_ONE_DIM == 0 && layout_type::is_column_major()) ||
-            (STRIDE_ONE_DIM == 1 && layout_type::is_row_major());
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                               ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
+    }
 
-      /*!
-       * Gets the maximum size of matrix along specified dimension
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr camp::idx_t s_dim_elem(camp::idx_t dim){
-        return dim == 0 ? ROW_SIZE : COL_SIZE;
-      }
 
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void store_ref(self_type const& self, RefType& ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
 
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>())
       {
-        this->broadcast(value);
-        return *this;
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                               ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        return this->copy(c);
+      // strided data
+      else
+      {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                                ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
+    }
+  };
+
+
+  template <typename POINTER_TYPE,
+            typename INDEX_TYPE,
+            RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE StrideInt1,
+            INDEX_TYPE StrideInt2,
+            INDEX_TYPE BeginInt1,
+            INDEX_TYPE BeginInt2,
+            INDEX_TYPE SizeInt1,
+            INDEX_TYPE SizeInt2,
+            camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<RAJA::internal::expt::StaticTensorRef<
+      POINTER_TYPE,
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      camp::int_seq<INDEX_TYPE, StrideInt1, StrideInt2>,
+      camp::int_seq<INDEX_TYPE, BeginInt1, BeginInt2>,
+      camp::int_seq<INDEX_TYPE, SizeInt1, SizeInt2>,
+      STRIDE_ONE_DIM>>
+  {
 
-
-      /*!
-       * Provide matrix-matrix multiply for operator* between to matrices
-       */
-      template<typename T2, typename L, typename RP>
-      self_type
-      operator*(SquareMatrixRegister<T2, L, RP> const &y) const
+    using RefType = RAJA::internal::expt::StaticTensorRef<
+        POINTER_TYPE,
+        INDEX_TYPE,
+        TENSOR_SIZE,
+        camp::int_seq<INDEX_TYPE, StrideInt1, StrideInt2>,
+        camp::int_seq<INDEX_TYPE, BeginInt1, BeginInt2>,
+        camp::int_seq<INDEX_TYPE, SizeInt1, SizeInt2>,
+        STRIDE_ONE_DIM>;
+
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void load_ref(self_type& self, RefType const& ref)
+    {
+
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
+
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>())
       {
-        return matrix_multiply(y);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                              ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
-
-      /*!
-       * Provide right matrix-vector multiply for operator* between this
-       * matrix and a vector.
-       */
-      template<typename T2, typename RP>
-      VectorRegister<T2, RP>
-      operator*(VectorRegister<T2, RP> const &y) const
+      // strided data
+      else
       {
-        return right_multiply_vector(y);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                               ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
+    }
 
 
-      template<typename REF_TYPE>
-      struct RefBridge;
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_INLINE
+    RAJA_HOST_DEVICE
+    static void store_ref(self_type const& self, RefType& ref)
+    {
 
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type& load_ref (REF_TYPE const &ref){
-          RefBridge<REF_TYPE>::load_ref(*this,ref);
-          return *this;
-      }
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
 
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_ref (REF_TYPE &ref) const {
-          RefBridge<REF_TYPE>::store_ref(*this,ref);
-          return *this;
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>())
+      {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                               ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
-
-
-
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>>
+      // strided data
+      else
       {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                                ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
+      }
+    }
+  };
+
+
+  /*!
+   * Loads a dense full matrix from memory.
+   *
+   * For row-major, column entries must be stride-1
+   * For column-major, row entries must be stride-1
+   *
+   * Non-stride-1 dimension can have any striding... so this can
+   * be a "semi-dense" matrix.
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type&
+  load_packed(element_type const* ptr, int row_stride, int col_stride)
+  {
+    // if it's dense in columns and rows, just do a dense load
+    if ((layout_type::is_row_major() && (row_stride == COL_SIZE)) ||
+        (layout_type::is_column_major() && (col_stride == ROW_SIZE)))
+    {
 
-          using RefType = RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void load_ref(self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>()){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                    ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-    
-    
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>())
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-
-      };
-
-
-
-
-      template<
-           typename POINTER_TYPE,
-           typename INDEX_TYPE,
-           RAJA::internal::expt::TensorTileSize TENSOR_SIZE, 
-           INDEX_TYPE StrideInt1, INDEX_TYPE StrideInt2,
-           INDEX_TYPE  BeginInt1, INDEX_TYPE  BeginInt2,
-           INDEX_TYPE   SizeInt1, INDEX_TYPE   SizeInt2,
-           camp::idx_t STRIDE_ONE_DIM
-      >
-      struct RefBridge
-      <RAJA::internal::expt::StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInt1,StrideInt2>,camp::int_seq<INDEX_TYPE,BeginInt1,BeginInt2>,camp::int_seq<INDEX_TYPE,SizeInt1,SizeInt2>,STRIDE_ONE_DIM>>
+      for (camp::idx_t reg = 0; reg < s_num_registers; ++reg)
       {
+        m_registers[reg].load_packed(ptr + reg * s_elements_per_register);
+      }
+    }
+    // Do semi-dense load for row-major
+    else if (layout_type::is_row_major())
+    {
 
-          using RefType = RAJA::internal::expt::StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInt1,StrideInt2>,camp::int_seq<INDEX_TYPE,BeginInt1,BeginInt2>,camp::int_seq<INDEX_TYPE,SizeInt1,SizeInt2>,STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void load_ref(self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>()){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                    ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-    
-    
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>())
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-
-      };
-
-
-
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
+        camp::idx_t reg = 0;
+        for (camp::idx_t row = 0; row < ROW_SIZE; ++row)
+        {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
+            camp::idx_t offset =
+                row * row_stride + colreg * s_elements_per_register;
 
-      /*!
-       * Loads a dense full matrix from memory.
-       *
-       * For row-major, column entries must be stride-1
-       * For column-major, row entries must be stride-1
-       *
-       * Non-stride-1 dimension can have any striding... so this is can
-       * be a "semi-dense" matrix.
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr,
-          int row_stride, int col_stride)
-      {
-        // if it's dense in columns and rows, just do a dense load
-        if((layout_type::is_row_major()&&(row_stride==COL_SIZE)) ||
-           (layout_type::is_column_major()&&(col_stride==ROW_SIZE))){
+            m_registers[reg].load_packed(ptr + offset);
 
-          for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-            m_registers[reg].load_packed(ptr + reg*s_elements_per_register);
+            reg++;
           }
-
         }
-        // Do semi-dense load for row-major
-        else if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            camp::idx_t reg = 0;
-            for(camp::idx_t row = 0;row < ROW_SIZE;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+      }
+      // more than one row per register
+      else
+      {
+        // default to strided operation
+        return load_strided(ptr, row_stride, col_stride);
+      }
+    }
+    // Do semi-dense load for column-major
+    else
+    {
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
 
-                camp::idx_t offset = row*row_stride + colreg*s_elements_per_register;
+        camp::idx_t reg = 0;
+        for (camp::idx_t col = 0; col < COL_SIZE; ++col)
+        {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
 
-                m_registers[reg].load_packed(ptr + offset);
+            camp::idx_t offset =
+                col * col_stride + rowreg * s_elements_per_register;
 
-                reg ++;
+            m_registers[reg].load_packed(ptr + offset);
 
-              }
-            }
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided(ptr, row_stride, col_stride);
+            reg++;
           }
         }
-        // Do semi-dense load for column-major
-        else{
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            camp::idx_t reg = 0;
-            for(camp::idx_t col = 0;col < COL_SIZE;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
-
-                camp::idx_t offset = col*col_stride + rowreg*s_elements_per_register;
+      }
+      // more than one column per register
+      else
+      {
+        // default to strided operation
+        return load_strided(ptr, row_stride, col_stride);
+      }
+    }
 
-                m_registers[reg].load_packed(ptr + offset);
+    return *this;
+  }
 
-                reg ++;
+  /*!
+   * Loads a strided full matrix from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type&
+  load_strided(element_type const* ptr, int row_stride, int col_stride)
+  {
 
-              }
-            }
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided(ptr, row_stride, col_stride);
-          }
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].load_strided(ptr + row * row_stride + col * col_stride,
+                                      col_stride);
         }
-
-        return *this;
       }
-
-      /*!
-       * Loads a strided full matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr,
-          int row_stride, int col_stride)
+      // less than one register per row
+      else
       {
-
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, col_stride);
-            }
-          }
-          // less than one register per row
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type const *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load(ptr_i, s_segbits, col_stride, row_stride);
-            }
-          }
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          element_type const* ptr_i =
+              ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load(ptr_i, s_segbits, col_stride,
+                                        row_stride);
         }
+      }
+    }
 
-        // column major
-        else{
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
+    // column major
+    else
+    {
 
-              m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, row_stride);
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type const *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load(ptr_i, s_segbits, row_stride, col_stride);
-            }
-          }
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
+
+          m_registers[i].load_strided(ptr + row * row_stride + col * col_stride,
+                                      row_stride);
         }
-
-        return *this;
       }
-
-      /*!
-       * Loads a dense partial matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_nm(element_type const *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols)
+      // less than one register per column
+      else
       {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          element_type const* ptr_i =
+              ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load(ptr_i, s_segbits, row_stride,
+                                        col_stride);
+        }
+      }
+    }
 
-        if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
+    return *this;
+  }
 
-            for(camp::idx_t row = 0;row < num_rows;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+  /*!
+   * Loads a dense partial matrix from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_packed_nm(element_type const* ptr,
+                            int row_stride,
+                            int col_stride,
+                            int num_rows,
+                            int num_cols)
+  {
 
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+    if (layout_type::is_row_major())
+    {
 
-                camp::idx_t col0 = colreg*s_elements_per_register;
-                camp::idx_t offset = row*row_stride + col0;
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
 
-                // loading a complete register
-                if(col0+s_elements_per_register <= num_cols){
-                  m_registers[reg].load_packed(ptr + offset);
-                }
+        for (camp::idx_t row = 0; row < num_rows; ++row)
+        {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-                // partial register at end of row
-                else{
-                  m_registers[reg].load_packed_n(ptr + offset, num_cols - col0);
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                  // zero out the remaining registers, if any
-                  for(camp::idx_t i = colreg+1;i < s_minor_dim_registers;++i){
-                    reg++;
-                    m_registers[reg] = element_type(0);
-                  }
+            camp::idx_t col0   = colreg * s_elements_per_register;
+            camp::idx_t offset = row * row_stride + col0;
 
-                  break; // end this row
-                }
-              }
+            // loading a complete register
+            if (col0 + s_elements_per_register <= num_cols)
+            {
+              m_registers[reg].load_packed(ptr + offset);
             }
 
-            // zero out remaining rows
-            for(camp::idx_t row = num_rows;row < ROW_SIZE;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
-
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+            // partial register at end of row
+            else
+            {
+              m_registers[reg].load_packed_n(ptr + offset, num_cols - col0);
 
+              // zero out the remaining registers, if any
+              for (camp::idx_t i = colreg + 1; i < s_minor_dim_registers; ++i)
+              {
+                reg++;
                 m_registers[reg] = element_type(0);
               }
+
+              break;  // end this row
             }
           }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
-          }
         }
-        // Do semi-dense load for column-major
-        else{
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
+        // zero out remaining rows
+        for (camp::idx_t row = num_rows; row < ROW_SIZE; ++row)
+        {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-            for(camp::idx_t col = 0;col < num_cols;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+            m_registers[reg] = element_type(0);
+          }
+        }
+      }
+      // more than one column per register
+      else
+      {
+        // default to strided operation
+        return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+      }
+    }
+    // Do semi-dense load for column-major
+    else
+    {
 
-                camp::idx_t row0 = rowreg*s_elements_per_register;
-                camp::idx_t offset = col*col_stride + row0;
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
 
-                // loading a complete register
-                if(row0+s_elements_per_register <= num_rows){
-                  m_registers[reg].load_packed(ptr + offset);
-                }
+        for (camp::idx_t col = 0; col < num_cols; ++col)
+        {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
 
-                // partial register at end of column
-                else{
-                  m_registers[reg].load_packed_n(ptr + offset, num_rows - row0);
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
 
-                  // zero out the remaining registers, if any
-                  for(camp::idx_t i = rowreg+1;i < s_minor_dim_registers;++i){
-                    reg++;
-                    m_registers[reg] = element_type(0);
-                  }
+            camp::idx_t row0   = rowreg * s_elements_per_register;
+            camp::idx_t offset = col * col_stride + row0;
 
-                  break; // end this column
-                }
-              }
+            // loading a complete register
+            if (row0 + s_elements_per_register <= num_rows)
+            {
+              m_registers[reg].load_packed(ptr + offset);
             }
-            // zero out remaining columns
-            for(camp::idx_t col = num_cols;col < COL_SIZE;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+            // partial register at end of column
+            else
+            {
+              m_registers[reg].load_packed_n(ptr + offset, num_rows - row0);
 
+              // zero out the remaining registers, if any
+              for (camp::idx_t i = rowreg + 1; i < s_minor_dim_registers; ++i)
+              {
+                reg++;
                 m_registers[reg] = element_type(0);
               }
-            }
 
+              break;  // end this column
+            }
           }
-          // more than one column per register
-          else{
+        }
+        // zero out remaining columns
+        for (camp::idx_t col = num_cols; col < COL_SIZE; ++col)
+        {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
+
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
 
-            // default to strided operation
-            return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+            m_registers[reg] = element_type(0);
           }
         }
-
-        return *this;
       }
-
-      /*!
-       * Loads a strided partial matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_nm(element_type const *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols)
+      // more than one column per register
+      else
       {
 
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row >= num_rows){
-                m_registers[i] = element_type(0);
-              }
-              else{
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
+        // default to strided operation
+        return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+      }
+    }
 
+    return *this;
+  }
 
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  m_registers[i].load_strided_n(ptr+row*row_stride+col*col_stride, col_stride, reg_num_cols);
-                }
-                else{
-                  m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, col_stride);
-                }
+  /*!
+   * Loads a strided partial matrix from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_strided_nm(element_type const* ptr,
+                             int row_stride,
+                             int col_stride,
+                             int num_rows,
+                             int num_cols)
+  {
 
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
 
-              }
-            }
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row >= num_rows)
+          {
+            m_registers[i] = element_type(0);
           }
-          // less than one register per row
           else
           {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
 
-              element_type const *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load_nm(ptr_i, s_segbits, col_stride, row_stride, num_cols, reg_num_rows);
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols)
+            {
+              reg_num_cols = num_cols - col;
+              m_registers[i].load_strided_n(ptr + row * row_stride +
+                                                col * col_stride,
+                                            col_stride, reg_num_cols);
+            }
+            else
+            {
+              m_registers[i].load_strided(
+                  ptr + row * row_stride + col * col_stride, col_stride);
             }
           }
         }
+      }
+      // less than one register per row
+      else
+      {
 
-        // column major
-        else{
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many rows get loaded in this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows             = reg_num_rows > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_rows;
+
+          element_type const* ptr_i =
+              ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load_nm(ptr_i, s_segbits, col_stride,
+                                           row_stride, num_cols, reg_num_rows);
+        }
+      }
+    }
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col >= num_cols){
-                m_registers[i] = element_type(0);
-              }
-              else{
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
+    // column major
+    else
+    {
 
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  m_registers[i].load_strided_n(ptr+row*row_stride+col*col_stride, row_stride, reg_num_rows);
-                }
-                else{
-                  m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, row_stride);
-                }
-              }
-            }
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col >= num_cols)
+          {
+            m_registers[i] = element_type(0);
           }
-          // less than one register per column
           else
           {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
 
-              element_type const *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load_nm(ptr_i, s_segbits, row_stride, col_stride, num_rows, reg_num_cols);
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows)
+            {
+              reg_num_rows = num_rows - row;
+              m_registers[i].load_strided_n(ptr + row * row_stride +
+                                                col * col_stride,
+                                            row_stride, reg_num_rows);
+            }
+            else
+            {
+              m_registers[i].load_strided(
+                  ptr + row * row_stride + col * col_stride, row_stride);
             }
           }
         }
-
-        return *this;
       }
+      // less than one register per column
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many columns get loaded in this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols             = reg_num_cols > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_cols;
+
+          element_type const* ptr_i =
+              ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load_nm(ptr_i, s_segbits, row_stride,
+                                           col_stride, num_rows, reg_num_cols);
+        }
+      }
+    }
 
+    return *this;
+  }
 
 
-      /*!
-       * Store a dense full matrix to memory.
-       *
-       * Column entries must be stride-1, rows may be any striding
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr,
-          int row_stride, int col_stride) const
-      {
+  /*!
+   * Store a dense full matrix to memory.
+   *
+   * Column entries must be stride-1, rows may be any striding
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const&
+  store_packed(element_type* ptr, int row_stride, int col_stride) const
+  {
 
-        // if it's dense in columns and rows, just do a dense load
-        if((layout_type::is_row_major()&&(row_stride==COL_SIZE)) ||
-           (layout_type::is_column_major()&&(col_stride==ROW_SIZE))){
+    // if it's dense in columns and rows, just do a dense store
+    if ((layout_type::is_row_major() && (row_stride == COL_SIZE)) ||
+        (layout_type::is_column_major() && (col_stride == ROW_SIZE)))
+    {
 
-          for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-            m_registers[reg].store_packed(ptr + reg*s_elements_per_register);
-          }
+      for (camp::idx_t reg = 0; reg < s_num_registers; ++reg)
+      {
+        m_registers[reg].store_packed(ptr + reg * s_elements_per_register);
+      }
+    }
+    // Do semi-dense store for row-major
+    else if (layout_type::is_row_major())
+    {
 
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].store_packed(ptr + row * row_stride +
+                                      col * col_stride);
         }
-        // Do semi-dense store for row-major
-        else if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].store_packed(ptr+row*row_stride+col*col_stride);
-            }
-          }
-          // more than one column per register
-          else{
-            store_strided(ptr, row_stride, col_stride);
-          }
-        }
-        // Do semi-dense store for column-major
-        else{
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-              m_registers[i].store_packed(ptr+row*row_stride+col*col_stride);
-            }
-          }
-          // more than one row per register
-          else{
-            store_strided(ptr, row_stride, col_stride);
-          }
+      }
+      // more than one column per register
+      else
+      {
+        store_strided(ptr, row_stride, col_stride);
+      }
+    }
+    // Do semi-dense store for column-major
+    else
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
+          m_registers[i].store_packed(ptr + row * row_stride +
+                                      col * col_stride);
         }
+      }
+      // more than one row per register
+      else
+      {
+        store_strided(ptr, row_stride, col_stride);
+      }
+    }
 
 
-        return *this;
-      }
+    return *this;
+  }
 
-      /*!
-       * Store a strided full matrix to memory
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr,
-          int row_stride, int col_stride) const
-      {
+  /*!
+   * Store a strided full matrix to memory
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const&
+  store_strided(element_type* ptr, int row_stride, int col_stride) const
+  {
 
 
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, col_stride);
-            }
-          }
-          // less than one register per row
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store(ptr_i, s_segbits, col_stride, row_stride);
-            }
-          }
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].store_strided(
+              ptr + row * row_stride + col * col_stride, col_stride);
         }
-
-        // column major
-        else{
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-              m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, row_stride);
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store(ptr_i, s_segbits, row_stride, col_stride);
-            }
-          }
+      }
+      // less than one register per row
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          element_type* ptr_i = ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store(ptr_i, s_segbits, col_stride,
+                                         row_stride);
         }
-
-        return *this;
       }
+    }
 
-      /*!
-       * Store a dense partial matrix to memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_nm(element_type *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols) const
+    // column major
+    else
+    {
+      // one or more registers per column
+      if (s_minor_dim_registers)
       {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
+          m_registers[i].store_strided(
+              ptr + row * row_stride + col * col_stride, row_stride);
+        }
+      }
+      // less than one register per column
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          element_type* ptr_i = ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store(ptr_i, s_segbits, row_stride,
+                                         col_stride);
+        }
+      }
+    }
 
+    return *this;
+  }
 
-        if(layout_type::is_row_major()){
+  /*!
+   * Store a dense partial matrix to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_packed_nm(element_type* ptr,
+                                   int row_stride,
+                                   int col_stride,
+                                   int num_rows,
+                                   int num_cols) const
+  {
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
 
-            for(camp::idx_t row = 0;row < num_rows;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+    if (layout_type::is_row_major())
+    {
 
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
 
-                camp::idx_t col0 = colreg*s_elements_per_register;
-                camp::idx_t offset = row*row_stride + col0;
+        for (camp::idx_t row = 0; row < num_rows; ++row)
+        {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-                // store a complete register
-                if(col0+s_elements_per_register <= num_cols){
-                  m_registers[reg].store_packed(ptr + offset);
-                }
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                // partial register at end of row
-                else{
-                  m_registers[reg].store_packed_n(ptr + offset, num_cols - col0);
+            camp::idx_t col0   = colreg * s_elements_per_register;
+            camp::idx_t offset = row * row_stride + col0;
 
-                  break; // end this row
-                }
-              }
+            // store a complete register
+            if (col0 + s_elements_per_register <= num_cols)
+            {
+              m_registers[reg].store_packed(ptr + offset);
             }
 
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return store_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+            // partial register at end of row
+            else
+            {
+              m_registers[reg].store_packed_n(ptr + offset, num_cols - col0);
+
+              break;  // end this row
+            }
           }
         }
-        // Do semi-dense store for column-major
-        else{
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t col = 0;col < num_cols;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
+      }
+      // more than one column per register
+      else
+      {
+        // default to strided operation
+        return store_strided_nm(ptr, row_stride, col_stride, num_rows,
+                                num_cols);
+      }
+    }
+    // Do semi-dense store for column-major
+    else
+    {
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
 
-                camp::idx_t row0 = rowreg*s_elements_per_register;
-                camp::idx_t offset = col*col_stride + row0;
+        for (camp::idx_t col = 0; col < num_cols; ++col)
+        {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
 
-                // loading a complete register
-                if(row0+s_elements_per_register <= num_rows){
-                  m_registers[reg].store_packed(ptr + offset);
-                }
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
 
-                // partial register at end of column
-                else{
-                  m_registers[reg].store_packed_n(ptr + offset, num_rows - row0);
+            camp::idx_t row0   = rowreg * s_elements_per_register;
+            camp::idx_t offset = col * col_stride + row0;
 
-                  break; // end this column
-                }
-              }
+            // store a complete register
+            if (row0 + s_elements_per_register <= num_rows)
+            {
+              m_registers[reg].store_packed(ptr + offset);
             }
 
-          }
-          // more than one column per register
-          else{
+            // partial register at end of column
+            else
+            {
+              m_registers[reg].store_packed_n(ptr + offset, num_rows - row0);
 
-            // default to strided operation
-            return store_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+              break;  // end this column
+            }
           }
         }
-
-        return *this;
       }
-
-      /*!
-       * Store a strided partial matrix to memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_nm(element_type *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols) const
+      // more than one column per register
+      else
       {
 
+        // default to strided operation
+        return store_strided_nm(ptr, row_stride, col_stride, num_rows,
+                                num_cols);
+      }
+    }
 
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row < num_rows){
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
+    return *this;
+  }
 
+  /*!
+   * Store a strided partial matrix to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_strided_nm(element_type* ptr,
+                                    int row_stride,
+                                    int col_stride,
+                                    int num_rows,
+                                    int num_cols) const
+  {
 
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  m_registers[i].store_strided_n(ptr+row*row_stride+col*col_stride, col_stride, reg_num_cols);
-                }
-                else{
-                  m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, col_stride);
-                }
 
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
 
-              }
-            }
-          }
-          // less than one register per row
-          else
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row < num_rows)
           {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
 
-              element_type *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store_nm(ptr_i, s_segbits, col_stride, row_stride, num_cols, reg_num_rows);
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols)
+            {
+              reg_num_cols = num_cols - col;
+              m_registers[i].store_strided_n(ptr + row * row_stride +
+                                                 col * col_stride,
+                                             col_stride, reg_num_cols);
+            }
+            else
+            {
+              m_registers[i].store_strided(
+                  ptr + row * row_stride + col * col_stride, col_stride);
             }
           }
         }
+      }
+      // less than one register per row
+      else
+      {
 
-        // column major
-        else{
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many rows get stored from this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows             = reg_num_rows > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_rows;
+
+          element_type* ptr_i = ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store_nm(ptr_i, s_segbits, col_stride,
+                                            row_stride, num_cols, reg_num_rows);
+        }
+      }
+    }
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col < num_cols){
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  m_registers[i].store_strided_n(ptr+row*row_stride+col*col_stride, row_stride, reg_num_rows);
-                }
-                else{
-                  m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, row_stride);
-                }
-              }
-            }
-          }
-          // less than one register per column
-          else
+    // column major
+    else
+    {
+
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col < num_cols)
           {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
 
-              element_type *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store_nm(ptr_i, s_segbits, row_stride, col_stride, num_rows, reg_num_cols);
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows)
+            {
+              reg_num_rows = num_rows - row;
+              m_registers[i].store_strided_n(ptr + row * row_stride +
+                                                 col * col_stride,
+                                             row_stride, reg_num_rows);
+            }
+            else
+            {
+              m_registers[i].store_strided(
+                  ptr + row * row_stride + col * col_stride, row_stride);
             }
           }
         }
-
-        return *this;
       }
+      // less than one register per column
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many columns get stored from this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols             = reg_num_cols > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_cols;
+
+          element_type* ptr_i = ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store_nm(ptr_i, s_segbits, row_stride,
+                                            col_stride, num_rows, reg_num_cols);
+        }
+      }
+    }
 
-
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_nm(self_type const &mat, int num_rows, int num_cols) const {
-        self_type result;
+    return *this;
+  }
 
 
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_nm(self_type const& mat, int num_rows, int num_cols) const
+  {
+    self_type result;
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row < num_rows){
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
 
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
 
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  result.m_registers[i] = m_registers[i].divide_n(mat.m_registers[i], reg_num_cols);
-                }
-                else{
-                  result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
-                }
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row < num_rows)
+          {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
 
 
-              }
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols)
+            {
+              reg_num_cols = num_cols - col;
+              result.m_registers[i] =
+                  m_registers[i].divide_n(mat.m_registers[i], reg_num_cols);
+            }
+            else
+            {
+              result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
             }
           }
-          // less than one register per row
-          else
-          {
+        }
+      }
+      // less than one register per row
+      else
+      {
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many rows are held in this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows             = reg_num_rows > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_rows;
 
-              result.m_registers[i] = m_registers[i].segmented_divide_nm(mat.m_registers[i], s_segbits, num_cols, reg_num_rows);
-            }
-          }
+          result.m_registers[i] = m_registers[i].segmented_divide_nm(
+              mat.m_registers[i], s_segbits, num_cols, reg_num_rows);
         }
+      }
+    }
 
-        // column major
-        else{
+    // column major
+    else
+    {
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col < num_cols){
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  result.m_registers[i] = m_registers[i].divide_n(mat.m_registers[i], reg_num_rows);
-                }
-                else{
-                  result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
-                }
-              }
-            }
-          }
-          // less than one register per column
-          else
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col < num_cols)
           {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
 
-              result.m_registers[i] = m_registers[i].segmented_divide_nm(mat.m_registers[i], s_segbits, num_rows, reg_num_cols);
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows)
+            {
+              reg_num_rows = num_rows - row;
+              result.m_registers[i] =
+                  m_registers[i].divide_n(mat.m_registers[i], reg_num_rows);
+            }
+            else
+            {
+              result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
             }
           }
         }
-
-
-        return result;
       }
+      // less than one register per column
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many columns are held in this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols             = reg_num_cols > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_cols;
+
+          result.m_registers[i] = m_registers[i].segmented_divide_nm(
+              mat.m_registers[i], s_segbits, num_rows, reg_num_cols);
+        }
+      }
+    }
 
 
+    return result;
+  }
 
-      /*!
-       * Matrix transpose, keeping layout
-       *
-       * Transpose is not completely implemented
-       */
+
+  /*!
+   * Matrix transpose, keeping layout
+   *
+   * Transpose is not completely implemented
+   */
 #if 0
       RAJA_HOST_DEVICE
       RAJA_INLINE
@@ -1291,386 +1512,427 @@ namespace expt
         return reinterpret_cast<transpose_tensor_type const &>(*this);
       }
 #endif
-      /*!
-       * Matrix vector product
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      column_vector_type right_multiply_vector(row_vector_type v) const {
-        column_vector_type result(0);
-        return right_multiply_vector_accumulate(v, result);
-      }
-
-      /*!
-       * Matrix vector product
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      row_vector_type left_multiply_vector(column_vector_type v) const {
-        row_vector_type result(0);
-        return left_multiply_vector_accumulate(v, result);
-      }
-
-
-      /*!
-       * Matrix vector product with accumulation into another vector
-       *
-       * acc += (this) * v
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      column_vector_type right_multiply_vector_accumulate(row_vector_type const &v, column_vector_type result) const {
-
-        if(layout_type::is_row_major()){
-
-          // 1 register is split over multiple rows
-          if(s_minor_dim_registers == 0){
-
-            // start by broadcasting the first segment in v across all of v
-            // we will use this term for all registers in the matrix
-            auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
-
-            // loop over output segments, which is also the number of
-            // registers in the matrix (no kidding!)
-            RAJA_UNROLL
-            for(camp::idx_t outseg = 0;outseg < s_num_registers;++ outseg){
-
-              // compute which result register we are accumulating into
-              camp::idx_t result_reg = outseg >> s_segbits;
+  /*!
+   * Matrix-vector product: returns (this) * v
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  column_vector_type right_multiply_vector(row_vector_type v) const
+  {
+    column_vector_type result(0);
+    return right_multiply_vector_accumulate(v, result);
+  }
 
-              // compute which segment within result_reg we are accumulating into
-              camp::idx_t result_seg = outseg - (result_reg<<s_segbits);
+  /*!
+   * Vector-matrix product: returns v * (this)
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  row_vector_type left_multiply_vector(column_vector_type v) const
+  {
+    row_vector_type result(0);
+    return left_multiply_vector_accumulate(v, result);
+  }
 
-              // compute segmented dot product to get output segment
-              auto value = m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
 
-              // accumulate result
-              result.get_register(result_reg) += value;
-            }
+  /*!
+   * Matrix vector product with accumulation into another vector
+   *
+   * acc += (this) * v
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  column_vector_type
+  right_multiply_vector_accumulate(row_vector_type const& v,
+                                   column_vector_type result) const
+  {
 
-          }
-          // one or more registers per row
-          else{
+    if (layout_type::is_row_major())
+    {
 
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t row = 0;row < s_num_rows;++ row){
+      // 1 register is split over multiple rows
+      if (s_minor_dim_registers == 0)
+      {
 
-              // compute partial dot products for all registers in this row
-              auto rowsum = register_type(0);
-              RAJA_UNROLL
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers;++ colreg){
+        // start by broadcasting the first segment in v across all of v
+        // we will use this term for all registers in the matrix
+        auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
 
-                rowsum = m_registers[reg].multiply_add(v.get_register(colreg), rowsum);
-                reg ++;
+        // loop over output segments, which is also the number of
+        // registers in the matrix (no kidding!)
+        RAJA_UNROLL
+        for (camp::idx_t outseg = 0; outseg < s_num_registers; ++outseg)
+        {
 
-              } // rowreg
+          // compute which result register we are accumulating into
+          camp::idx_t result_reg = outseg >> s_segbits;
 
-              // finish dot product by taking sum of rowsum
-              auto value = result.get(row) + rowsum.sum();
-              result.set(value, row);
+          // compute which segment within result_reg we are accumulating into
+          camp::idx_t result_seg = outseg - (result_reg << s_segbits);
 
-            } // row
-          }
+          // compute segmented dot product to get output segment
+          auto value =
+              m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
 
+          // accumulate result
+          result.get_register(result_reg) += value;
         }
-        else{
-
-
-          // 1 register is split over multiple columns
-          if(s_minor_dim_registers == 0){
-
-            auto &mv = result.get_register(0);
-
-            // Loop over registers, which are also the segments in v
-            RAJA_UNROLL
-            for(camp::idx_t m_reg = 0;m_reg < s_num_registers;++ m_reg){
-              camp::idx_t v_reg = m_reg >> s_segbits;
-              camp::idx_t v_seg = m_reg & ( (1<<s_segbits) - 1);
+      }
+      // one or more registers per row
+      else
+      {
 
-              auto v_tmp = v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
-              mv = m_registers[m_reg].multiply_add(v_tmp, mv);
+        // Loop over rows
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t row = 0; row < s_num_rows; ++row)
+        {
 
-            }
+          // compute partial dot products for all registers in this row
+          auto rowsum = register_type(0);
+          RAJA_UNROLL
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-            // Now sum segments in mv together to form final result
-            mv = mv.segmented_sum_outer(s_segbits, 0);
+            rowsum =
+                m_registers[reg].multiply_add(v.get_register(colreg), rowsum);
+            reg++;
 
-          }
-          // one or more registers per column
-          else{
+          }  // colreg
 
-            // Loop over columns (which is also registers)
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t col = 0;col < s_num_columns;++ col){
+          // finish dot product by taking sum of rowsum
+          auto value = result.get(row) + rowsum.sum();
+          result.set(value, row);
 
-              // extract column value from v
-              auto v_col = register_type(v.get(col));
+        }  // row
+      }
+    }
+    else
+    {
 
-              // apply v_col to entire column (1 or more registers)
-              RAJA_UNROLL
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers;++ rowreg){
 
-                auto &mv = result.get_register(rowreg);
-                mv = m_registers[reg].multiply_add(v_col, mv);
+      // 1 register is split over multiple columns
+      if (s_minor_dim_registers == 0)
+      {
 
-                reg ++;
+        auto& mv = result.get_register(0);
 
-              } // rowreg
-            } // col
-          }
+        // Loop over registers, which are also the segments in v
+        RAJA_UNROLL
+        for (camp::idx_t m_reg = 0; m_reg < s_num_registers; ++m_reg)
+        {
+          camp::idx_t v_reg = m_reg >> s_segbits;
+          camp::idx_t v_seg = m_reg & ((1 << s_segbits) - 1);
 
+          auto v_tmp =
+              v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
+          mv = m_registers[m_reg].multiply_add(v_tmp, mv);
         }
-        return result;
-      }
-
-      /*!
-       * Matrix vector product with accumulation into another vector
-       *
-       * acc += v * (this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      row_vector_type left_multiply_vector_accumulate(column_vector_type const &v, row_vector_type result) const {
 
-        if(layout_type::is_row_major()){
-
-          // 1 register is split over multiple columns
-          if(s_minor_dim_registers == 0){
-            auto &vm = result.get_register(0);
-
-            // Loop over registers, which are also the segments in v
-            RAJA_UNROLL
-            for(camp::idx_t m_reg = 0;m_reg < s_num_registers;++ m_reg){
-              camp::idx_t v_reg = m_reg >> s_segbits;
-              camp::idx_t v_seg = m_reg & ( (1<<s_segbits) - 1);
+        // Now sum segments in mv together to form final result
+        mv = mv.segmented_sum_outer(s_segbits, 0);
+      }
+      // one or more registers per column
+      else
+      {
 
-              auto v_tmp = v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
-              vm = m_registers[m_reg].multiply_add(v_tmp, vm);
+        // Loop over columns (which is also registers)
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t col = 0; col < s_num_columns; ++col)
+        {
 
-            }
+          // extract column value from v
+          auto v_col = register_type(v.get(col));
 
-            // Now sum segments in mv together to form final result
-            vm = vm.segmented_sum_outer(s_segbits, 0);
+          // apply v_col to entire column (1 or more registers)
+          RAJA_UNROLL
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
 
-          }
-          // one or more registers per row
-          else{
+            auto& mv = result.get_register(rowreg);
+            mv       = m_registers[reg].multiply_add(v_col, mv);
 
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t row = 0;row < s_num_rows;++ row){
-              auto lhs_bcat = register_type(v.get(row));
-              RAJA_UNROLL
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers;++ colreg){
+            reg++;
 
-                result.get_register(colreg) =
-                    m_registers[reg].multiply_add(lhs_bcat, result.get_register(colreg));
-                reg ++;
+          }  // rowreg
+        }    // col
+      }
+    }
+    return result;
+  }
+
+  /*!
+   * Matrix vector product with accumulation into another vector
+   *
+   * acc += v * (this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  row_vector_type left_multiply_vector_accumulate(column_vector_type const& v,
+                                                  row_vector_type result) const
+  {
 
-              } // rowreg
+    if (layout_type::is_row_major())
+    {
 
-            }
+      // 1 register is split over multiple rows
+      if (s_minor_dim_registers == 0)
+      {
+        auto& vm = result.get_register(0);
+
+        // Loop over registers, which are also the segments in v
+        RAJA_UNROLL
+        for (camp::idx_t m_reg = 0; m_reg < s_num_registers; ++m_reg)
+        {
+          camp::idx_t v_reg = m_reg >> s_segbits;
+          camp::idx_t v_seg = m_reg & ((1 << s_segbits) - 1);
+
+          auto v_tmp =
+              v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
+          vm = m_registers[m_reg].multiply_add(v_tmp, vm);
+        }
 
-          }
+        // Now sum segments in vm together to form final result
+        vm = vm.segmented_sum_outer(s_segbits, 0);
+      }
+      // one or more registers per row
+      else
+      {
 
+        // Loop over rows
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t row = 0; row < s_num_rows; ++row)
+        {
+          auto lhs_bcat = register_type(v.get(row));
+          RAJA_UNROLL
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-        } // row-major
+            result.get_register(colreg) = m_registers[reg].multiply_add(
+                lhs_bcat, result.get_register(colreg));
+            reg++;
 
-        // Column-major:
-        else{
-          // 1 register is split over multiple rows
-          if(s_minor_dim_registers == 0){
+          }  // colreg
+        }
+      }
 
-            // start by broadcasting the first segment in v across all of v
-            // we will use this term for all registers in the matrix
-            auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
 
-            // loop over output segments, which is also the number of
-            // registers in the matrix (no kidding!)
-            RAJA_UNROLL
-            for(camp::idx_t outseg = 0;outseg < s_num_registers;++ outseg){
+    }  // row-major
 
-              // compute which result register we are accumulating into
-              camp::idx_t result_reg = outseg >> s_segbits;
+    // Column-major:
+    else
+    {
+      // 1 register is split over multiple columns
+      if (s_minor_dim_registers == 0)
+      {
 
-              // compute which segment within result_reg we are accumulating into
-              camp::idx_t result_seg = outseg - (result_reg<<s_segbits);
+        // start by broadcasting the first segment in v across all of v
+        // we will use this term for all registers in the matrix
+        auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
 
-              // compute segmented dot product to get output segment
-              auto value = m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
+        // loop over output segments, which is also the number of
+        // registers in the matrix (no kidding!)
+        RAJA_UNROLL
+        for (camp::idx_t outseg = 0; outseg < s_num_registers; ++outseg)
+        {
 
-              // accumulate result
-              result.get_register(result_reg) += value;
-            }
+          // compute which result register we are accumulating into
+          camp::idx_t result_reg = outseg >> s_segbits;
 
-          }
-          // one or more registers per column
-          else{
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t col = 0;col < s_num_columns;++ col){
-
-              // compute partial dot products for all registers in this row
-              auto colsum = register_type(0);
-              RAJA_UNROLL
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers;++ rowreg){
-                colsum = m_registers[reg].multiply_add(v.get_register(rowreg), colsum);
-                reg ++;
-
-              } // rowreg
-
-              // finish dot product by taking sum of rowsum
-              auto value = result.get(col) + colsum.sum();
-              result.set(value, col);
-
-            } // col
-          }
+          // compute which segment within result_reg we are accumulating into
+          camp::idx_t result_seg = outseg - (result_reg << s_segbits);
 
+          // compute segmented dot product to get output segment
+          auto value =
+              m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
 
-        } // col-major
-        return result;
+          // accumulate result
+          result.get_register(result_reg) += value;
+        }
       }
+      // one or more registers per column
+      else
+      {
+        // Loop over columns
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t col = 0; col < s_num_columns; ++col)
+        {
+
+          // compute partial dot products for all registers in this column
+          auto colsum = register_type(0);
+          RAJA_UNROLL
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
+            colsum =
+                m_registers[reg].multiply_add(v.get_register(rowreg), colsum);
+            reg++;
 
+          }  // rowreg
 
+          // finish dot product by taking sum of colsum
+          auto value = result.get(col) + colsum.sum();
+          result.set(value, col);
 
-
-
-      /*!
-       * Matrix-Matrix product
-       */
-      template<typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
-      matrix_multiply(RMAT const &mat) const {
-        typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type res(0);
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply(*this, mat, res);
-        return res;
+        }  // col
       }
 
-      /*!
-       * Matrix-Matrix multiply add
-       */
-      template<typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
-      matrix_multiply_add(RMAT const &B, typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type const &C) const {
-        typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type res(C);
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply_accumulate(*this, B, res);
-        return res;
-      }
 
-      /*!
-       * Matrix-Matrix multiply accumulate
-       */
-      template<typename ACCMAT, typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      void
-      matrix_multiply_accumulate(ACCMAT &acc, RMAT const &B) const {
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply_accumulate(*this, B, acc);
-      }
+    }  // col-major
+    return result;
+  }
 
 
+  /*!
+   * Matrix-Matrix product
+   */
+  template <typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE typename RAJA::internal::expt::
+      MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
+      matrix_multiply(RMAT const& mat) const
+  {
+    typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+        self_type, RMAT>::result_type res(0);
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::multiply(
+        *this, mat, res);
+    return res;
+  }
+
+  /*!
+   * Matrix-Matrix multiply add
+   */
+  template <typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE typename RAJA::internal::expt::
+      MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
+      matrix_multiply_add(
+          RMAT const& B,
+          typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+              self_type,
+              RMAT>::result_type const& C) const
+  {
+    typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+        self_type, RMAT>::result_type res(C);
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+        self_type, RMAT>::multiply_accumulate(*this, B, res);
+    return res;
+  }
+
+  /*!
+   * Matrix-Matrix multiply accumulate
+   */
+  template <typename ACCMAT, typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE void
+  matrix_multiply_accumulate(ACCMAT& acc, RMAT const& B) const
+  {
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+        self_type, RMAT>::multiply_accumulate(*this, B, acc);
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &set(element_type val, int row, int col){
-        m_registers[to_register(row, col)].set(val, to_lane(row,col));
-        return *this;
-      }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type get(int row, int col) const {
-        return m_registers[to_register(row, col)].get(to_lane(row,col));
-      }
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& set(element_type val, int row, int col)
+  {
+    m_registers[to_register(row, col)].set(val, to_lane(row, col));
+    return *this;
+  }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type get(int row, int col) const
+  {
+    return m_registers[to_register(row, col)].get(to_lane(row, col));
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type extract_diagonal_register(camp::idx_t starting_column, camp::idx_t segbits, camp::idx_t segment) const {
 
-        register_type result(0);
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  register_type extract_diagonal_register(camp::idx_t starting_column,
+                                          camp::idx_t segbits,
+                                          camp::idx_t segment) const
+  {
 
-        camp::idx_t num_rows = register_type::s_num_elem >> segbits;
-        camp::idx_t num_repeats = 1 << segbits;
+    register_type result(0);
 
-        camp::idx_t col0 = (starting_column + num_rows*segment)%s_num_columns;
-        camp::idx_t row0 = num_rows*segment;
+    camp::idx_t num_rows    = register_type::s_num_elem >> segbits;
+    camp::idx_t num_repeats = 1 << segbits;
 
-        for(camp::idx_t i = 0;i < num_rows;++i){
-          camp::idx_t col = (col0 + i) % s_num_columns;
-          camp::idx_t row = row0 + i;
-          auto value = get(row,col);
-          for(camp::idx_t j = 0;j < num_repeats;++j){
-            result.set(value, (i<<segbits) + j);
-          }
-        }
+    camp::idx_t col0 = (starting_column + num_rows * segment) % s_num_columns;
+    camp::idx_t row0 = num_rows * segment;
 
-        return result;
+    for (camp::idx_t i = 0; i < num_rows; ++i)
+    {
+      camp::idx_t col = (col0 + i) % s_num_columns;
+      camp::idx_t row = row0 + i;
+      auto value      = get(row, col);
+      for (camp::idx_t j = 0; j < num_repeats; ++j)
+      {
+        result.set(value, (i << segbits) + j);
       }
+    }
 
+    return result;
+  }
 
-      /*!
-       * @brief Converts to matrix to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string(bool one_line=false) const {
-        std::string s = "Matrix(" + std::to_string(s_num_rows) +
-            "x" + std::to_string(s_num_columns);
-        if(!one_line){
-          s +=")\n";
-        }
 
+  /*!
+   * @brief Converts the matrix to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string(bool one_line = false) const
+  {
+    std::string s = "Matrix(" + std::to_string(s_num_rows) + "x" +
+                    std::to_string(s_num_columns);
+    if (!one_line)
+    {
+      s += ")\n";
+    }
 
-        s += "[ ";
 
-        //
-        for(camp::idx_t r = 0;r < s_num_rows; ++ r){
-          if(r > 0){
-            s += ", ";
-            if(!one_line){
-              s+= "\n  ";
-            }
-          }
-          s += "[";
-          for(camp::idx_t c = 0;c < s_num_columns; ++ c){
-            if(c > 0){
-              s += ", ";
-            }
-            s += std::to_string(this->get(r,c));
-          }
-          s += "]";
-        }
+    s += "[ ";
 
-        s += " ]";
-        if(!one_line){
-          s+="\n";
+    //
+    for (camp::idx_t r = 0; r < s_num_rows; ++r)
+    {
+      if (r > 0)
+      {
+        s += ", ";
+        if (!one_line)
+        {
+          s += "\n  ";
         }
-        return s;
       }
+      s += "[";
+      for (camp::idx_t c = 0; c < s_num_columns; ++c)
+      {
+        if (c > 0)
+        {
+          s += ", ";
+        }
+        s += std::to_string(this->get(r, c));
+      }
+      s += "]";
+    }
 
-  }; // MatrixRegisterImpl
-
-
+    s += " ]";
+    if (!one_line)
+    {
+      s += "\n";
+    }
+    return s;
+  }
 
+};  // MatrixRegisterImpl
 
 
-} // namespace expt
+}  // namespace expt
 }  // namespace RAJA
 
 
-
-
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/RegisterBase.hpp b/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
index 3480fda10c..af2ca27b98 100644
--- a/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
@@ -33,1184 +33,1197 @@ namespace RAJA
 {
 namespace expt
 {
-  template<typename T, typename REGISTER_POLICY>
-  class Register;
+template <typename T, typename REGISTER_POLICY>
+class Register;
 }
 
 namespace internal
 {
 namespace expt
 {
-  class RegisterConcreteBase {};
+class RegisterConcreteBase
+{};
 
 
-  /*
-   * Overload for:    arithmetic + TensorRegister
+/*
+ * Overload for:    arithmetic + TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator+(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).add(rhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type                                  = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator+(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).add(rhs);
+}
 
-  /*
-   * Overload for:    arithmetic - TensorRegister
+/*
+ * Overload for:    arithmetic - TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator-(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).subtract(rhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type                                  = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator-(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).subtract(rhs);
+}
 
-  /*
-   * Overload for:    arithmetic * TensorRegister
+/*
+ * Overload for:    arithmetic * TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator*(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return rhs.scale(lhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type                                  = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator*(LEFT const& lhs, RIGHT const& rhs)
+{
+  return rhs.scale(lhs);
+}
 
-  /*
-   * Overload for:    arithmetic / TensorRegister
+/*
+ * Overload for:    arithmetic / TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template <
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator/(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).divide(rhs);
-  }
-
-
-
-
-
-  /*!
-   * Register base class that provides some default behaviors and simplifies
-   * the implementation of new register types.
-   *
-   * This uses CRTP to provide static polymorphism
-   */
-  template<typename Derived>
-  class RegisterBase;
-
-  template<typename T, typename REGISTER_POLICY>
-  class RegisterBase<RAJA::expt::Register<T, REGISTER_POLICY>> :
-    public RegisterConcreteBase
-  {
-    public:
-      using self_type = RAJA::expt::Register<T, REGISTER_POLICY>;
-      using element_type = camp::decay<T>;
-
-      using index_type = camp::idx_t;
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type                                  = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator/(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).divide(rhs);
+}
 
-      using int_element_type = typename RegisterTraits<REGISTER_POLICY, T>::int_element_type;
-      using int_vector_type = RAJA::expt::Register<int_element_type, REGISTER_POLICY>;
 
-    private:
+/*!
+ * Register base class that provides some default behaviors and simplifies
+ * the implementation of new register types.
+ *
+ * This uses CRTP to provide static polymorphism
+ */
+template <typename Derived>
+class RegisterBase;
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type *getThis(){
-        return static_cast<self_type *>(this);
-      }
+template <typename T, typename REGISTER_POLICY>
+class RegisterBase<RAJA::expt::Register<T, REGISTER_POLICY>>
+    : public RegisterConcreteBase
+{
+public:
+  using self_type    = RAJA::expt::Register<T, REGISTER_POLICY>;
+  using element_type = camp::decay<T>;
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      self_type const *getThis() const{
-        return static_cast<self_type const *>(this);
-      }
+  using index_type = camp::idx_t;
 
-    public:
+  using int_element_type =
+      typename RegisterTraits<REGISTER_POLICY, T>::int_element_type;
+  using int_vector_type =
+      RAJA::expt::Register<int_element_type, REGISTER_POLICY>;
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return true;
-      }
+private:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type* getThis() { return static_cast<self_type*>(this); }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr self_type const* getThis() const
+  {
+    return static_cast<self_type const*>(this);
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      RegisterBase(){}
+public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return true; }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~RegisterBase(){}
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr RegisterBase() {}
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  ~RegisterBase() {}
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      RegisterBase(RegisterBase const &){}
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      RegisterBase(self_type const &){
-      }
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr RegisterBase(RegisterBase const&) {}
 
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  constexpr RegisterBase(self_type const&) {}
 
 
-      /*!
-       * @brief Broadcast scalar value to first N register elements
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      self_type s_broadcast_n(element_type const &value, camp::idx_t N){
-        self_type x;
-        for(camp::idx_t i = 0;i < N;++ i){
-          x.set(value, i);
-        }
-        return x;
-      }
+  /*!
+   * @brief Broadcast scalar value to first N register elements
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static self_type s_broadcast_n(element_type const& value, camp::idx_t N)
+  {
+    self_type x;
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      x.set(value, i);
+    }
+    return x;
+  }
 
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.broadcast(getThis()->get(i));
-        return x;
-      }
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.broadcast(getThis()->get(i));
+    return x;
+  }
 
 
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> offsets){
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type&
+  gather(element_type const* ptr,
+         RAJA::expt::Register<T2, REGISTER_POLICY> offsets)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
-          getThis()->set(ptr[offsets.get(i)], i);
-        }
-        return *getThis();
-      }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      getThis()->set(ptr[offsets.get(i)], i);
+    }
+    return *getThis();
+  }
 
 
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets, camp::idx_t N){
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type&
+  gather_n(element_type const* ptr,
+           RAJA::expt::Register<T2, REGISTER_POLICY> const& offsets,
+           camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-          for(camp::idx_t i = 0;i < N;++ i){
-            getThis()->set(ptr[offsets.get(i)], i);
-          }
-          return *getThis();
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        getThis()->gather(ptr, self_type::s_segmented_offsets(segbits, stride_inner, stride_outer));
-        return *getThis();
-      }
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      getThis()->set(ptr[offsets.get(i)], i);
+    }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   *
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& segmented_load(element_type const* ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    getThis()->gather(ptr, self_type::s_segmented_offsets(segbits, stride_inner,
+                                                          stride_outer));
+    return *getThis();
+  }
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   *
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& segmented_load_nm(element_type const* ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
 
-            if(seg >= num_outer || i >= num_inner){
-              getThis()->set(element_type(0), lane);
-            }
-            else{
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size     = 1 << segbits;
 
-              camp::idx_t offset = seg*stride_outer + i*stride_inner;
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg)
+    {
+      for (camp::idx_t i = 0; i < seg_size; ++i)
+      {
 
-              element_type value = ptr[offset];
+        if (seg >= num_outer || i >= num_inner)
+        {
+          getThis()->set(element_type(0), lane);
+        }
+        else
+        {
 
-              getThis()->set(value, lane);
+          camp::idx_t offset = seg * stride_outer + i * stride_inner;
 
-            }
+          element_type value = ptr[offset];
 
-            lane ++;
-          }
+          getThis()->set(value, lane);
         }
 
-        return *getThis();
+        lane++;
       }
+    }
 
+    return *getThis();
+  }
 
 
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets) const {
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const&
+  scatter(element_type* ptr,
+          RAJA::expt::Register<T2, REGISTER_POLICY> const& offsets) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
-          ptr[offsets.get(i)] = getThis()->get(i);
-        }
-        return *getThis();
-      }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      ptr[offsets.get(i)] = getThis()->get(i);
+    }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets, camp::idx_t N) const {
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const&
+  scatter_n(element_type* ptr,
+            RAJA::expt::Register<T2, REGISTER_POLICY> const& offsets,
+            camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[offsets.get(i)] = getThis()->get(i);
-        }
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        getThis()->scatter(ptr, self_type::s_segmented_offsets(segbits, stride_inner, stride_outer));
-        return *getThis();
-      }
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[offsets.get(i)] = getThis()->get(i);
+    }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * into larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and scatter
+   * operations.
+   *
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& segmented_store(element_type* ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    getThis()->scatter(ptr, self_type::s_segmented_offsets(
+                                segbits, stride_inner, stride_outer));
+    return *getThis();
+  }
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * into larger arrays where we store partial segments.
+   *
+   *
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& segmented_store_nm(element_type* ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
 
-            if(!(seg >= num_outer || i >= num_inner)){
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size     = 1 << segbits;
 
-              camp::idx_t offset = seg*stride_outer + i*stride_inner;
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg)
+    {
+      for (camp::idx_t i = 0; i < seg_size; ++i)
+      {
 
-              ptr[offset] = getThis()->get(lane);
+        if (!(seg >= num_outer || i >= num_inner))
+        {
 
-            }
+          camp::idx_t offset = seg * stride_outer + i * stride_inner;
 
-            lane ++;
-          }
+          ptr[offset] = getThis()->get(lane);
         }
 
-        return *getThis();
-      }
-
-      /*!
-       * @brief Set entire register to a single scalar value
-       * @param value Value to set all register elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
-      {
-        getThis()->broadcast(value);
-        return *getThis();
+        lane++;
       }
+    }
 
-      /*!
-       * @brief Set entire register to a single scalar value
-       * @param value Value to set all register elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(RAJA::expt::Register<T2, RAJA::expt::scalar_register> const &value)
-      {
-        getThis()->broadcast(value.get(0));
-        return *getThis();
-      }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Assign one register to another
-       * @param x register to copy
-       * @return Value of (*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &x)
-      {
-        getThis()->copy(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Set entire register to a single scalar value
+   * @param value Value to set all register elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator=(element_type value)
+  {
+    getThis()->broadcast(value);
+    return *getThis();
+  }
 
+  /*!
+   * @brief Set entire register from a scalar register
+   * @param value Scalar register; its element 0 is broadcast to all elements
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type&
+  operator=(RAJA::expt::Register<T2, RAJA::expt::scalar_register> const& value)
+  {
+    getThis()->broadcast(value.get(0));
+    return *getThis();
+  }
 
+  /*!
+   * @brief Assign one register to another
+   * @param x register to copy
+   * @return Value of (*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator=(self_type const& x)
+  {
+    getThis()->copy(x);
+    return *getThis();
+  }
 
 
+  /*!
+   * @brief Add two registers
+   * @param x register to add
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator+(self_type const& x) const { return getThis()->add(x); }
 
-      /*!
-       * @brief Add two registers
-       * @param x register to add
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(self_type const &x) const
-      {
-        return getThis()->add(x);
-      }
 
+  /*!
+   * @brief Add a register to this register
+   * @param x register to add
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator+=(self_type const& x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Add a register to this register
-       * @param x register to add
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(self_type const &x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Add scalar to this register
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   *
+   * This broadcasts the scalar to all lanes, then adds to this register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator+(element_type const& x) const { return getThis()->add(x); }
 
-      /*!
-       * @brief Add scalar to this register
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       *
-       * This broadcasts the scalar to all lanes, then adds to this register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(element_type const &x) const
-      {
-        return getThis()->add(x);
-      }
 
+  /*!
+   * @brief Add a scalar to this register
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   *
+   * This broadcasts the scalar to all lanes, then adds to this register
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator+=(element_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Add a scalar to this register
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       *
-       * This broadcasts the scalar to all lanes, then adds to this register
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(element_type x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Negate the value of this register
+   * @return Value of -(*this), computed as self_type(0).subtract(*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-() const { return self_type(0).subtract(*getThis()); }
 
-      /*!
-       * @brief Negate the value of this register
-       * @return Value of -(*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-() const
-      {
-        return self_type(0).subtract(*getThis());
-      }
+  /*!
+   * @brief Subtract a register from this register
+   * @param x register to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-(self_type const& x) const
+  {
+    return getThis()->subtract(x);
+  }
 
-      /*!
-       * @brief Subtract two register registers
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(self_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
+  /*!
+   * @brief Subtract a register from this register, in place
+   * @param x register to subtract from this register
+   * @return Reference to this register, which now holds (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator-=(self_type const& x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Subtract a register from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(self_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Subtract scalar from this register
+   * @param x scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-(element_type const& x) const
+  {
+    return getThis()->subtract(x);
+  }
 
-      /*!
-       * @brief Subtract scalar from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(element_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
+  /*!
+   * @brief Subtract a scalar from this register, in place
+   * @param x scalar to subtract from this register
+   * @return Reference to this register, which now holds (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator-=(element_type const& x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Subtract a scalar from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(element_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Multiply two register registers, element wise
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator*(RHS const &rhs) const
-      {
-        return getThis()->multiply(rhs);
-      }
+  /*!
+   * @brief Multiply this register by rhs
+   * @param rhs right-hand operand, passed through to multiply()
+   * @return Value of (*this)*rhs
+   */
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator*(RHS const& rhs) const
+  {
+    return getThis()->multiply(rhs);
+  }
 
-      /*!
-       * @brief Multiply a register with this register
-       * @param x register to multiple with this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator*=(RHS const &rhs)
-      {
-        *getThis() = getThis()->multiply(rhs);
-        return *getThis();
-      }
+  /*!
+   * @brief Multiply this register by rhs, in place
+   * @param rhs right-hand operand to multiply with this register
+   * @return Reference to this register, which now holds (*this)*rhs
+   */
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator*=(RHS const& rhs)
+  {
+    *getThis() = getThis()->multiply(rhs);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Divide two register registers, element wise
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(self_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
+  /*!
+   * @brief Divide this register by another register, element wise
+   * @param x register to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type operator/(self_type const& x) const { return getThis()->divide(x); }
 
-      /*!
-       * @brief Divide this register by another register
-       * @param x register to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(self_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Divide this register by another register, in place
+   * @param x register to divide by
+   * @return Reference to this register, which now holds (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator/=(self_type const& x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
 
 
-      /*!
-       * @brief Divide by a scalar, element wise
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(element_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
+  /*!
+   * @brief Divide by a scalar, element wise
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type operator/(element_type const& x) const
+  {
+    return getThis()->divide(x);
+  }
 
-      /*!
-       * @brief Divide this register by another register
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(element_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Divide this register by a scalar, in place
+   * @param x Scalar to divide by
+   * @return Reference to this register, which now holds (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator/=(element_type const& x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
 
 
-      /*!
-       * @brief Divide n elements of this register by another register
-       * @param x register to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t n) const {
-        self_type q(*getThis());
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(getThis()->get(i) / b.get(i), i);
-        }
-        return q;
-      }
+  /*!
+   * @brief Divide the first n elements of this register by another register
+   * @param b register to divide by
+   * @param n Number of elements to divide
+   * @return Copy of (*this) with elements 0..n-1 replaced by (*this)[i]/b[i]
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t n) const
+  {
+    self_type q(*getThis());
+    for (camp::idx_t i = 0; i < n; ++i)
+    {
+      q.set(getThis()->get(i) / b.get(i), i);
+    }
+    return q;
+  }
 
-      /*!
-       * @brief Divide n elements of this register by a scalar
-       * @param x Scalar to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(element_type const &b, camp::idx_t n) const {
-        self_type q(*getThis());
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(getThis()->get(i) / b, i);
-        }
-        return q;
-      }
+  /*!
+   * @brief Divide the first n elements of this register by a scalar
+   * @param b Scalar to divide by
+   * @param n Number of elements to divide
+   * @return Copy of (*this) with elements 0..n-1 replaced by (*this)[i]/b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(element_type const& b, camp::idx_t n) const
+  {
+    self_type q(*getThis());
+    for (camp::idx_t i = 0; i < n; ++i)
+    {
+      q.set(getThis()->get(i) / b, i);
+    }
+    return q;
+  }
 
-      /*!
-       * @brief Dot product of two registers
-       * @param x Other register to dot with this register
-       * @return Value of (*this) dot x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type dot(self_type const &x) const
-      {
-        return getThis()->multiply(x).sum();
-      }
+  /*!
+   * @brief Dot product of two registers
+   * @param x Other register to dot with this register
+   * @return Value of (*this) dot x, computed as multiply(x).sum()
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type dot(self_type const& x) const
+  {
+    return getThis()->multiply(x).sum();
+  }
 
-      /*!
-       * @brief Fused multiply add: fma(b, c) = (*this)*b+c
-       *
-       * Derived types can override this to implement intrinsic FMA's
-       *
-       * @param b Second product operand
-       * @param c Sum operand
-       * @return Value of (*this)*b+c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return (self_type(*getThis()) * self_type(b)) + self_type(c);
-      }
+  /*!
+   * @brief Fused multiply add: fma(b, c) = (*this)*b+c
+   *
+   * This default is not fused: it applies operator* and then operator+.
+   * Derived types can override this to implement intrinsic FMA's
+   * @param b Second product operand
+   * @param c Sum operand
+   * @return Value of (*this)*b+c
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return (self_type(*getThis()) * self_type(b)) + self_type(c);
+  }
 
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return getThis()->multiply_add(b, -c);
-      }
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
+   *
+   * This default negates c and forwards to multiply_add(b, -c).
+   * Derived types can override this to implement intrinsic FMS's
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return getThis()->multiply_add(b, -c);
+  }
 
-      /*!
-       * Multiply this tensor by a scalar value
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type scale(element_type c) const
-      {
-        return getThis()->multiply(self_type(c));
-      }
+  /*!
+   * Multiply this tensor by the scalar value c, via multiply(self_type(c))
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type scale(element_type c) const
+  {
+    return getThis()->multiply(self_type(c));
+  }
 
-      /*!
-       * Minimum value across first N lanes of register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min_n(camp::idx_t N) const
-      {
-        return getThis()->min(N);
-      }
+  /*!
+   * Minimum value across first N lanes of register (forwards to min(N))
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type min_n(camp::idx_t N) const { return getThis()->min(N); }
 
-      /*!
-       * Maximum value across first N lanes of register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max_n(camp::idx_t N) const
-      {
-        return getThis()->max(N);
-      }
+  /*!
+   * Maximum value across first N lanes of register (forwards to max(N))
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type max_n(camp::idx_t N) const { return getThis()->max(N); }
 
-      /*!
-       * Provides vector-level building block for matrix transpose operations.
-       *
-       * This is a non-optimized reference version which will be used if
-       * no architecture specialized version is supplied
-       *
-       * This is a permute-and-shuffle left operation
-       *
-       *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
-       *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
-       *
-       *  lvl=0    Z=   x0  y0  x2  y2  x4  y4  x6  y6...
-       *  lvl=1    Z=   x0  x1  y0  y1  x4  x5  y4  y5...
-       *  lvl=2    Z=   x0  x1  x2  x3  y0  y1  y2  y3...
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type transpose_shuffle_left(camp::idx_t lvl, self_type const &y) const
-      {
-        auto const &x = *getThis();
+  /*!
+   * Provides vector-level building block for matrix transpose operations.
+   *
+   * This is a non-optimized reference version which will be used if
+   * no architecture specialized version is supplied
+   *
+   * This is a permute-and-shuffle left operation
+   *
+   *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
+   *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
+   *
+   *  lvl=0    Z=   x0  y0  x2  y2  x4  y4  x6  y6...
+   *  lvl=1    Z=   x0  x1  y0  y1  x4  x5  y4  y5...
+   *  lvl=2    Z=   x0  x1  x2  x3  y0  y1  y2  y3...
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type transpose_shuffle_left(camp::idx_t lvl, self_type const& y) const
+  {
+    auto const& x = *getThis();
 
-        self_type z;
+    self_type z;
 
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
 
-          // extract value x or y
-          camp::idx_t xy_select = (i >> lvl) & 0x1;
+      // bit lvl of the lane index picks the source: 0 -> x, 1 -> y
+      camp::idx_t xy_select = (i >> lvl) & 0x1;
 
 
-          z.set(xy_select == 0 ? x.get(i) : y.get(i - (1<<lvl)), i);
-        }
+      z.set(xy_select == 0 ? x.get(i) : y.get(i - (1 << lvl)), i);
+    }
 
-        return z;
-      }
+    return z;
+  }
 
 
-      /*!
-       * Provides vector-level building block for matrix transpose operations.
-       *
-       * This is a non-optimized reference version which will be used if
-       * no architecture specialized version is supplied
-       *
-       * This is a permute-and-shuffle right operation
-       *
-       *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
-       *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
-       *
-       *  lvl=0    Z=   x1  y1  x3  y3  x5  y5  x7  y7...
-       *  lvl=1    Z=   x2  x3  y2  y3  x6  x7  y6  y7...
-       *  lvl=2    Z=   x4  x5  x6  x7  y4  y5  y6  y7...
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type transpose_shuffle_right(int lvl, self_type const &y) const
-      {
-        auto const &x = *getThis();
+  /*!
+   * Provides vector-level building block for matrix transpose operations.
+   *
+   * This is a non-optimized reference version which will be used if
+   * no architecture specialized version is supplied
+   *
+   * This is a permute-and-shuffle right operation
+   *
+   *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
+   *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
+   *
+   *  lvl=0    Z=   x1  y1  x3  y3  x5  y5  x7  y7...
+   *  lvl=1    Z=   x2  x3  y2  y3  x6  x7  y6  y7...
+   *  lvl=2    Z=   x4  x5  x6  x7  y4  y5  y6  y7...
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type transpose_shuffle_right(int lvl, self_type const& y) const
+  {
+    auto const& x = *getThis();
 
-        self_type z;
+    self_type z;
 
-        camp::idx_t i0 = 1<<lvl;
+    camp::idx_t i0 = 1 << lvl;
 
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
 
-          // extract value x or y
-          camp::idx_t xy_select = (i >> lvl) & 0x1;
+      // extract value x or y
+      camp::idx_t xy_select = (i >> lvl) & 0x1;
 
-          z.set(xy_select == 0 ? x.get(i0 + i) : y.get(i0 + i - (1<<lvl)), i);
-        }
+      z.set(xy_select == 0 ? x.get(i0 + i) : y.get(i0 + i - (1 << lvl)), i);
+    }
 
-        return z;
-      }
+    return z;
+  }
 
 
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
 
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size     = 1 << segbits;
 
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg)
+    {
+      for (camp::idx_t i = 0; i < seg_size; ++i)
       {
-        int_vector_type result;
-
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
-
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
-            result.set(seg*stride_outer + i*stride_inner, lane);
-            lane ++;
-          }
-        }
-
-        return result;
+        result.set(seg * stride_outer + i * stride_inner, lane);
+        lane++;
       }
+    }
 
+    return result;
+  }
 
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-        self_type result(0);
-
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        int output_offset = output_segment * self_type::s_num_elem>>segbits;
-
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          auto value = getThis()->get(i) + result.get((i >> segbits)+output_offset);
-          result.set(value, (i >> segbits)+output_offset);
-        }
 
-        return result;
-      }
-
-      /*!
-       * Sum all segments as subvectors, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 the segments are size 1, which means that this is just a
-       *      sum of all elements.  The output_segment determines where the
-       *      result is placed.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0, 0, 0
-       *
-       *      output_segment=3:
-       *      Result= 0, 0, x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0
-       *
-       *  segbits=1 the segments are 2-wide:
-       *
-       *      output_segment=0:
-       *      Result= x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0
-       *
-       *  and so on up to segbits=3, which is just the original vector:
-       *  segbits=3
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-        self_type result(0);
-
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        int output_offset = output_segment * (1<<segbits);
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+    self_type result(0);
 
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          camp::idx_t output_i = output_offset + (i&((1<<segbits)-1));
-          auto value = getThis()->get(i) + result.get(output_i);
-          result.set(value, output_i);
-        }
+    // reference implementation: add each lane into its segment's output slot
+    // NOTE: parses as (output_segment * s_num_elem) >> segbits ('*' before '>>')
+    int output_offset = output_segment * self_type::s_num_elem >> segbits;
 
-        return result;
-      }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      auto value =
+          getThis()->get(i) + result.get((i >> segbits) + output_offset);
+      result.set(value, (i >> segbits) + output_offset);
+    }
 
+    return result;
+  }
 
+  /*!
+   * Sum all segments as subvectors, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 the segments are size 1, which means that this is just a
+   *      sum of all elements.  The output_segment determines where the
+   *      result is placed.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0, 0
+   *
+   *      output_segment=3:
+   *      Result= 0, 0, 0, x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0
+   *
+   *  segbits=1 the segments are 2-wide:
+   *
+   *      output_segment=0:
+   *      Result= x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0
+   *
+   *  and so on up to segbits=3, which is just the original vector:
+   *  segbits=3
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+    self_type result(0);
 
-      RAJA_INLINE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        self_type result;
+    // default implementation is dumb, just sum each value into
+    // appropriate segment lane
+    int output_offset = output_segment * (1 << segbits);
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      camp::idx_t output_i = output_offset + (i & ((1 << segbits) - 1));
+      auto value           = getThis()->get(i) + result.get(output_i);
+      result.set(value, output_i);
+    }
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
+    return result;
+  }
 
-            if(seg >= num_outer || i >= num_inner){
-              result.set(element_type(0), lane);
-            }
-            else{
 
-              element_type div = getThis()->get(lane) / den.get(lane);
+  RAJA_INLINE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    self_type result;
 
-              result.set(div, lane);
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size     = 1 << segbits;
 
-            }
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg)
+    {
+      for (camp::idx_t i = 0; i < seg_size; ++i)
+      {
 
-            lane ++;
-          }
+        if (seg >= num_outer || i >= num_inner)
+        {
+          result.set(element_type(0), lane);
         }
+        else
+        {
 
-        return result;
-      }
-
+          element_type div = getThis()->get(lane) / den.get(lane);
 
+          result.set(div, lane);
+        }
 
-      /*!
-       * Segmented dot product performs dot products
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *                      Y = y0, y1, y2, y3, y4, y5, y6, y7
-       *
-       *
-       *  segbits=0 is equivalent to a vector multiply,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0*y0, x1*y1, x2*y2, x3*y3, x4*y4, x5*y5, x6*y6, x7*y7
-       *
-       *  segbits=1 sums neighboring pairs of products.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7
-       *
-       *  and so on up to segbits=3, which is a full dot-product of x and y, and the
-       *      output_segment denotes the vector position of the result
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type segmented_dot(camp::idx_t segbits, camp::idx_t output_segment, self_type const &x) const
-      {
-        return getThis()->multiply(x).segmented_sum_inner(segbits, output_segment);
+        lane++;
       }
+    }
 
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      input_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      input_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      input_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      input_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      input_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
-
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
+    return result;
+  }
 
-          auto off = (i&mask) + offset;
 
-          result.set(getThis()->get(off), i);
-        }
+  /*!
+   * Segmented dot product performs dot products
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *                      Y = y0, y1, y2, y3, y4, y5, y6, y7
+   *
+   *
+   *  segbits=0 is equivalent to a vector multiply,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0*y0, x1*y1, x2*y2, x3*y3, x4*y4, x5*y5, x6*y6, x7*y7
+   *
+   *  segbits=1 sums neighboring pairs of products.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7
+   *
+   *  and so on up to segbits=3, which is a full dot-product of x and y, and the
+   *      output_segment denotes the vector position of the result
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type segmented_dot(camp::idx_t segbits,
+                          camp::idx_t output_segment,
+                          self_type const& x) const
+  {
+    return getThis()->multiply(x).segmented_sum_inner(segbits, output_segment);
+  }
 
-        return result;
-      }
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
 
+    camp::idx_t mask   = (1 << segbits) - 1;
+    camp::idx_t offset = input_segment << segbits;
 
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
+    // default implementation is dumb, just copy each value from the
+    // selected input segment into its output lane
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
 
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
+      auto off = (i & mask) + offset;
 
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
+      result.set(getThis()->get(off), i);
+    }
 
-          auto off = (i>>segbits) + offset;
+    return result;
+  }
 
-          result.set(getThis()->get(off), i);
-        }
 
-        return result;
-      }
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      output_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      output_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      output_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
 
+    camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
 
+    // default implementation is dumb, just copy the selected source
+    // element into each output lane
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
 
+      auto off = (i >> segbits) + offset;
 
+      result.set(getThis()->get(off), i);
+    }
 
-      /*!
-       * @brief Converts to vector to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string() const {
-        std::string s = "Register(" + std::to_string(self_type::s_num_elem) + ")[ ";
+    return result;
+  }
 
-        //
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          s += std::to_string(getThis()->get(i)) + " ";
-        }
 
-        s += " ]\n";
+  /*!
+   * @brief Converts the vector to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string() const
+  {
+    std::string s = "Register(" + std::to_string(self_type::s_num_elem) + ")[ ";
 
-        return s;
-      }
+    //
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      s += std::to_string(getThis()->get(i)) + " ";
+    }
 
-  };
+    s += " ]\n";
 
+    return s;
+  }
+};
 
-} // namespace expt
-} // namespace internal
-} // namespace RAJA
 
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp b/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
index bb53993fed..c92921df2a 100644
--- a/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
@@ -27,347 +27,277 @@ namespace RAJA
 
 namespace internal
 {
-    /* Partial specialization for the strip_index_type_t helper in
-       IndexValue.hpp
-    */
-    template<typename IDX, typename VECTOR_TYPE, camp::idx_t DIM>
-    struct StripIndexTypeT<RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>>
-    {
-        using type = typename RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>::value_type;
-    };
+/* Partial specialization for the strip_index_type_t helper in
+   IndexValue.hpp
+*/
+template <typename IDX, typename VECTOR_TYPE, camp::idx_t DIM>
+struct StripIndexTypeT<RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>>
+{
+  using type =
+      typename RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>::value_type;
+};
 
 
 namespace expt
 {
 
 
+// Helper that strips the Vector type from an argument
+template <typename ARG>
+struct TensorIndexTraits
+{
+  using arg_type   = ARG;
+  using value_type = strip_index_type_t<ARG>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return false; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const& strip(arg_type const& arg) { return arg; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(arg_type const arg)
+  {
+    return arg;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(arg_type const&) { return 1; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(arg_type const&) { return 0; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return 0; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem() { return 1; }
+};
+
+template <typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
+struct TensorIndexTraits<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>>
+{
+  using index_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using arg_type   = IDX;
+  using value_type = strip_index_type_t<IDX>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return true; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const& strip(index_type const& arg) { return *arg; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(index_type const arg)
+  {
+    return (arg_type)arg;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(index_type const& arg) { return arg.size(); }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(index_type const& arg)
+  {
+    return arg.begin();
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return DIM; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem()
+  {
+    return TENSOR_TYPE::s_dim_elem(DIM);
+  }
+};
+
+
+template <typename IDX,
+          typename TENSOR_TYPE,
+          camp::idx_t DIM,
+          IDX INDEX_VALUE,
+          strip_index_type_t<IDX> LENGTH_VALUE>
+struct TensorIndexTraits<RAJA::expt::StaticTensorIndex<
+    RAJA::expt::StaticTensorIndexInner<IDX,
+                                       TENSOR_TYPE,
+                                       DIM,
+                                       INDEX_VALUE,
+                                       LENGTH_VALUE>>>
+{
+  using base_type  = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using index_type = RAJA::expt::StaticTensorIndex<
+      RAJA::expt::StaticTensorIndexInner<IDX,
+                                         TENSOR_TYPE,
+                                         DIM,
+                                         INDEX_VALUE,
+                                         LENGTH_VALUE>>;
+  using arg_type   = IDX;
+  using value_type = strip_index_type_t<IDX>;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return true; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(index_type const)
+  {
+    return INDEX_VALUE;
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(index_type const&) { return LENGTH_VALUE; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(index_type const&) { return INDEX_VALUE; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return DIM; }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem()
+  {
+    return TENSOR_TYPE::s_dim_elem(DIM);
+  }
+};
+
+/*
+ * Returns whether ARG is a tensor index type.
+ *
+ * For scalars (any non-TensorIndex type), always returns false.
+ *
+ * For TensorIndex and StaticTensorIndex types, returns true.
+ */
+template <typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr bool isTensorIndex()
+{
+  return TensorIndexTraits<ARG>::isTensorIndex();
+}
 
+template <typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto stripTensorIndex(ARG const& arg) ->
+    typename TensorIndexTraits<ARG>::arg_type const&
+{
+  return TensorIndexTraits<ARG>::strip(arg);
+}
 
 
-
-    // Helper that strips the Vector type from an argument
-    template<typename ARG>
-    struct TensorIndexTraits {
-        using arg_type = ARG;
-        using value_type = strip_index_type_t<ARG>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return false;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const &strip(arg_type const &arg){
-          return arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(arg_type const arg){
-          return arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(arg_type const &){
-          return 1;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(arg_type const &){
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return 1;
-        }
-    };
-
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
-    struct TensorIndexTraits<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>> {
-        using index_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
-        using arg_type = IDX;
-        using value_type = strip_index_type_t<IDX>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return true;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const &strip(index_type const &arg){
-          return *arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(index_type const arg){
-          return (arg_type)arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(index_type const &arg){
-          return arg.size();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(index_type const &arg){
-          return arg.begin();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return DIM;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return TENSOR_TYPE::s_dim_elem(DIM);
-        }
-    };
-
-
-
-
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, IDX INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-    struct TensorIndexTraits<RAJA::expt::StaticTensorIndex<
-        RAJA::expt::StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>
-    >> {
-        using base_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
-        using index_type = RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>>;
-        using arg_type = IDX;
-        using value_type = strip_index_type_t<IDX>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return true;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(index_type const){
-          return INDEX_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(index_type const &){
-          return LENGTH_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(index_type const &){
-          return INDEX_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return DIM;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return TENSOR_TYPE::s_dim_elem(DIM);
-        }
-    };
-
-    /*
-     * Returns vector size of argument.
-     *
-     * For scalars, always returns 1.
-     *
-     * For VectorIndex types, returns the number of vector lanes.
-     */
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    bool isTensorIndex()
-    {
-      return TensorIndexTraits<ARG>::isTensorIndex();
-    }
-
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto stripTensorIndex(ARG const &arg) ->
-    typename TensorIndexTraits<ARG>::arg_type const &
-    {
-      return TensorIndexTraits<ARG>::strip(arg);
-    }
-
-
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto stripTensorIndexByValue(ARG const arg) ->
+template <typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto
+stripTensorIndexByValue(ARG const arg) ->
     typename TensorIndexTraits<ARG>::arg_type const
-    {
-      return TensorIndexTraits<ARG>::strip_by_value(arg);
-    }
-
-    /*
-     * Returns tensor dimension size of argument.
-     *
-     * For VectorIndex types, returns the number of vector lanes.
-     */
-    template<typename ARG, typename IDX>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    IDX getTensorSize(ARG const &arg, IDX dim_size)
-    {
-      return TensorIndexTraits<ARG>::size(arg) >= 0 ?
-          IDX(TensorIndexTraits<ARG>::size(arg)) :
-          dim_size;
-    }
-
-    /*
-     * Returns tensor dimenson beginning index of an argument.
-     *
-     */
-    template<typename ARG, typename IDX>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    IDX getTensorBegin(ARG const &arg, IDX dim_minval)
-    {
-      return TensorIndexTraits<ARG>::begin(arg) >= 0 ?
-          IDX(TensorIndexTraits<ARG>::begin(arg)) :
-          dim_minval;
-    }
-
-    /*
-     * Returns vector dim of argument.
-     *
-     * For scalars, always returns 0.
-     *
-     * For VectorIndex types, returns the DIM argument.
-     * For vector_exec, this is always 0
-     *
-     * For matrices, DIM means:
-     *   0 : Row
-     *   1 : Column
-     */
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto getTensorDim() ->
-      decltype(TensorIndexTraits<ARG>::dim())
-    {
-      return TensorIndexTraits<ARG>::dim();
-    }
-
-} // namespace expt
-
-
-    /*
-     * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
-     * includes the vector length with them
-     */
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
-    struct LambdaSegExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
-    {
-
-      template<typename Data>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      static RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM> extract(Data &&data)
-      {
-        return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
-            camp::get<id>(data.segment_tuple).begin()[camp::get<id>(data.offset_tuple)],
-            camp::get<id>(data.vector_sizes));
-      }
-
-    };
-
-    /*
-     * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
-     * includes the vector length with them
-     */
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
-    struct LambdaOffsetExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
-    {
-
-      template<typename Data>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      static RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM> extract(Data &&data)
-      {
-        return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
-            IDX(camp::get<id>(data.offset_tuple)), // convert offset type to IDX
-            camp::get<id>(data.vector_sizes));
-      }
-
-    };
-
-} // namespace internal
+{
+  return TensorIndexTraits<ARG>::strip_by_value(arg);
+}
+
+/*
+ * Returns tensor dimension size of argument.
+ *
+ * For VectorIndex types, returns the number of vector lanes.
+ */
+template <typename ARG, typename IDX>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr IDX getTensorSize(ARG const& arg,
+                                                         IDX dim_size)
+{
+  return TensorIndexTraits<ARG>::size(arg) >= 0
+             ? IDX(TensorIndexTraits<ARG>::size(arg))
+             : dim_size;
+}
+
+/*
+ * Returns tensor dimension beginning index of an argument.
+ *
+ */
+template <typename ARG, typename IDX>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr IDX getTensorBegin(ARG const& arg,
+                                                          IDX dim_minval)
+{
+  return TensorIndexTraits<ARG>::begin(arg) >= 0
+             ? IDX(TensorIndexTraits<ARG>::begin(arg))
+             : dim_minval;
+}
+
+/*
+ * Returns vector dim of argument.
+ *
+ * For scalars, always returns 0.
+ *
+ * For VectorIndex types, returns the DIM argument.
+ * For vector_exec, this is always 0
+ *
+ * For matrices, DIM means:
+ *   0 : Row
+ *   1 : Column
+ */
+template <typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto getTensorDim()
+    -> decltype(TensorIndexTraits<ARG>::dim())
+{
+  return TensorIndexTraits<ARG>::dim();
+}
+
+}  // namespace expt
+
+
+/*
+ * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
+ * includes the vector length with them
+ */
+template <typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
+struct LambdaSegExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
+{
+
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static RAJA::expt::
+      TensorIndex<IDX, TENSOR_TYPE, DIM>
+      extract(Data&& data)
+  {
+    return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
+        camp::get<id>(data.segment_tuple)
+            .begin()[camp::get<id>(data.offset_tuple)],
+        camp::get<id>(data.vector_sizes));
+  }
+};
+
+/*
+ * Lambda<N, Offset<X>>  overload that matches VectorIndex types, and properly
+ * includes the vector length with them
+ */
+template <typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
+struct LambdaOffsetExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
+{
+
+  template <typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static RAJA::expt::
+      TensorIndex<IDX, TENSOR_TYPE, DIM>
+      extract(Data&& data)
+  {
+    return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
+        IDX(camp::get<id>(data.offset_tuple)),  // convert offset type to IDX
+        camp::get<id>(data.vector_sizes));
+  }
+};
+
+}  // namespace internal
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/internal/TensorRef.hpp b/include/RAJA/pattern/tensor/internal/TensorRef.hpp
index 60e31f24b9..73d6f788c1 100644
--- a/include/RAJA/pattern/tensor/internal/TensorRef.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorRef.hpp
@@ -30,656 +30,750 @@ namespace internal
 namespace expt
 {
 
-    template<typename INT_SEQ>
-    struct StaticIndexArray;
-
-    template<typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, typename ARRAY>
-    struct PrependStaticIndexArray;
-
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY >
-    struct AddStaticIndexArray;
-
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY >
-    struct SetStaticIndexArray;
-
-
-    template<typename INDEX_TYPE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>> {
-        
-        using seq_type = camp::int_seq<INDEX_TYPE,HEAD,TAIL...>;
-        using Self = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Tail = StaticIndexArray<camp::int_seq<INDEX_TYPE,TAIL...>>;
-
-        Tail tail;
-
-        RAJA_INLINE
-        StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>() = default;
-       
-	 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static constexpr INDEX_TYPE value_at(size_t index) {
-            if(index == 0){
-                return HEAD;
-            } else {
-                return Tail::value_at(index-1);
-            }
-        }
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        constexpr INDEX_TYPE operator[](size_t index) const {
-            if(index == 0){
-                return HEAD;
-            } else {
-                return tail[index-1];
-            }
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print_values() const {
-            printf("%ld ",(long)HEAD);
-            tail.print_values();
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-            printf("[");
-            print_values();
-            printf("]");
-        }
-
-
-    };
-
-    template<typename INDEX_TYPE>
-    struct StaticIndexArray<camp::int_seq<INDEX_TYPE>>
-    {
+template <typename INT_SEQ>
+struct StaticIndexArray;
+
+template <typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, typename ARRAY>
+struct PrependStaticIndexArray;
 
-        using seq_type = camp::int_seq<INDEX_TYPE>;
+template <typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY>
+struct AddStaticIndexArray;
 
-        RAJA_INLINE
-        StaticIndexArray<camp::int_seq<INDEX_TYPE>>() = default;
+template <typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY>
+struct SetStaticIndexArray;
 
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static constexpr INDEX_TYPE value_at(size_t) {
-            return 0;
-        }
+template <typename INDEX_TYPE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
+struct StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>
+{
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        constexpr INDEX_TYPE operator[](size_t) const {
-            return 0;
-        }
+  using seq_type = camp::int_seq<INDEX_TYPE, HEAD, TAIL...>;
+  using Self     = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Tail     = StaticIndexArray<camp::int_seq<INDEX_TYPE, TAIL...>>;
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print_values() const {}
+  Tail tail;
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-            print("[]");
-        }
+  RAJA_INLINE
+  StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>() = default;
 
-    };
 
-    template<typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, INDEX_TYPE... ORIG_INTS>
-    struct PrependStaticIndexArray<INDEX_TYPE, NEW_HEAD, StaticIndexArray<camp::int_seq<INDEX_TYPE,ORIG_INTS...>>>
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr INDEX_TYPE value_at(size_t index)
+  {
+    if (index == 0)
+    {
+      return HEAD;
+    }
+    else
     {
-        using Type = StaticIndexArray<camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>>;
-        using Seq  = camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>;
-    };
+      return Tail::value_at(index - 1);
+    }
+  }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr INDEX_TYPE operator[](size_t index) const
+  {
+    if (index == 0)
+    {
+      return HEAD;
+    }
+    else
+    {
+      return tail[index - 1];
+    }
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print_values() const
+  {
+    printf("%ld ", (long)HEAD);
+    tail.print_values();
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("[");
+    print_values();
+    printf("]");
+  }
+};
+
+template <typename INDEX_TYPE>
+struct StaticIndexArray<camp::int_seq<INDEX_TYPE>>
+{
 
+  using seq_type = camp::int_seq<INDEX_TYPE>;
 
+  RAJA_INLINE
+  StaticIndexArray<camp::int_seq<INDEX_TYPE>>() = default;
 
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct AddStaticIndexArray<INDEX_TYPE, IDX, DELTA, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>> 
-    {
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using AddTail = typename AddStaticIndexArray<INDEX_TYPE,IDX-1,DELTA,typename Orig::Tail>::Type;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,AddTail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,AddTail>::Seq;
-    };
-
-    template<typename INDEX_TYPE, INDEX_TYPE DELTA, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct AddStaticIndexArray<INDEX_TYPE, 0, DELTA, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>>
-    {
 
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,HEAD+DELTA,typename Orig::Tail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,HEAD+DELTA,typename Orig::Tail>::Seq;
-    };
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr INDEX_TYPE value_at(size_t) { return 0; }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr INDEX_TYPE operator[](size_t) const { return 0; }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print_values() const {}
 
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE VALUE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>> 
-    {
-        using Orig    = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using SetTail = typename SetStaticIndexArray<INDEX_TYPE,IDX-1,VALUE,typename Orig::Tail>::Type;
-        using Type    = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,SetTail>::Type;
-        using Seq     = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,SetTail>::Seq;
-    };
-
-    template<typename INDEX_TYPE, INDEX_TYPE VALUE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct SetStaticIndexArray<INDEX_TYPE, 0, VALUE, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>>
-    {
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,VALUE,typename Orig::Tail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,VALUE,typename Orig::Tail>::Seq;
-    };
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const { printf("[]"); }  // empty array: nothing between brackets
+};
 
+template <typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, INDEX_TYPE... ORIG_INTS>
+struct PrependStaticIndexArray<
+    INDEX_TYPE,
+    NEW_HEAD,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, ORIG_INTS...>>>
+{
+  using Type =
+      StaticIndexArray<camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>>;
+  using Seq = camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>;
+};
+
+
+template <typename INDEX_TYPE,
+          size_t IDX,
+          INDEX_TYPE DELTA,
+          INDEX_TYPE HEAD,
+          INDEX_TYPE... TAIL>
+struct AddStaticIndexArray<
+    INDEX_TYPE,
+    IDX,
+    DELTA,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>>
+{
+  using Orig    = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using AddTail = typename AddStaticIndexArray<INDEX_TYPE,
+                                               IDX - 1,
+                                               DELTA,
+                                               typename Orig::Tail>::Type;
+  using Type =
+      typename PrependStaticIndexArray<INDEX_TYPE, HEAD, AddTail>::Type;
+  using Seq = typename PrependStaticIndexArray<INDEX_TYPE, HEAD, AddTail>::Seq;
+};
+
+template <typename INDEX_TYPE,
+          INDEX_TYPE DELTA,
+          INDEX_TYPE HEAD,
+          INDEX_TYPE... TAIL>
+struct AddStaticIndexArray<
+    INDEX_TYPE,
+    0,
+    DELTA,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>>
+{
 
-    enum TensorTileSize
-    {
-      TENSOR_PARTIAL,  // the tile is a full TensorRegister
-      TENSOR_FULL,     // the tile is a partial TensorRegister
-      TENSOR_MULTIPLE  // the tile is multiple TennsorRegisters
-    };
+  using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Type = typename PrependStaticIndexArray<INDEX_TYPE,
+                                                HEAD + DELTA,
+                                                typename Orig::Tail>::Type;
+  using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,
+                                               HEAD + DELTA,
+                                               typename Orig::Tail>::Seq;
+};
+
+
+template <typename INDEX_TYPE,
+          size_t IDX,
+          INDEX_TYPE VALUE,
+          INDEX_TYPE HEAD,
+          INDEX_TYPE... TAIL>
+struct SetStaticIndexArray<
+    INDEX_TYPE,
+    IDX,
+    VALUE,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>>
+{
+  using Orig    = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using SetTail = typename SetStaticIndexArray<INDEX_TYPE,
+                                               IDX - 1,
+                                               VALUE,
+                                               typename Orig::Tail>::Type;
+  using Type =
+      typename PrependStaticIndexArray<INDEX_TYPE, HEAD, SetTail>::Type;
+  using Seq = typename PrependStaticIndexArray<INDEX_TYPE, HEAD, SetTail>::Seq;
+};
+
+template <typename INDEX_TYPE,
+          INDEX_TYPE VALUE,
+          INDEX_TYPE HEAD,
+          INDEX_TYPE... TAIL>
+struct SetStaticIndexArray<
+    INDEX_TYPE,
+    0,
+    VALUE,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>>
+{
+  using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Type = typename PrependStaticIndexArray<INDEX_TYPE,
+                                                VALUE,
+                                                typename Orig::Tail>::Type;
+  using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,
+                                               VALUE,
+                                               typename Orig::Tail>::Seq;
+};
 
-    template<typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS>
-    struct TensorTile
-    {
-        using self_type = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
-        using nonstatic_self_type = self_type;
-        using index_type = INDEX_TYPE;
-        index_type m_begin[NUM_DIMS];
-        index_type m_size[NUM_DIMS];
-
-        static constexpr camp::idx_t s_num_dims = NUM_DIMS;
-        static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
-
-
-        template<typename I, TensorTileSize S>
-        void copy(TensorTile<I, S, NUM_DIMS> const &c)
-        {
-          for(camp::idx_t i = 0;i < NUM_DIMS;++i){
-            m_begin[i] = c.m_begin[i];
-            m_size[i] = c.m_size[i];
-          }
-        }
-
-        /*!
-         * Subtract begin offsets of two tiles.
-         *
-         * The resulting tile has the sizes of the left operand, but has
-         * m_begin[i] = left.m_begin[i] - right.m_begin[i]
-         *
-         */
-        template<typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE2>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator-(TensorTile<INDEX_TYPE2, TENSOR_SIZE2, NUM_DIMS> const &sub) const {
-          self_type result(*this);
-          for(camp::idx_t i = 0;i < s_num_dims; ++ i){
-            result.m_begin[i] -= sub.m_begin[i];
-          }
-          return result;
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("TensorTile: dims=%d, m_begin=[",  (int)NUM_DIMS);
-
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_begin[i]);
-          }
-
-          printf("], m_size=[");
-
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_size[i]);
-          }
-
-          printf("]\n");
-        }
-    };
-
-
-
-
-    template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    struct StaticTensorTile;
-
-    template< typename INDEX_TYPE,
-              TensorTileSize TENSOR_SIZE,
-              INDEX_TYPE... BeginInts,
-              INDEX_TYPE... SizeInts>
-    struct StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, BeginInts...>,
-              camp::int_seq<INDEX_TYPE, SizeInts...>>
-    {
 
+enum TensorTileSize
+{
+  TENSOR_PARTIAL,  // the tile is a partial TensorRegister
+  TENSOR_FULL,     // the tile is a full TensorRegister
+  TENSOR_MULTIPLE  // the tile is multiple TensorRegisters
+};
 
+template <typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS>
+struct TensorTile
+{
+  using self_type           = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
+  using nonstatic_self_type = self_type;
+  using index_type          = INDEX_TYPE;
+  index_type m_begin[NUM_DIMS];
+  index_type m_size[NUM_DIMS];
 
-        using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
-        using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts... >;
-        using begin_type = StaticIndexArray<begin_seq>;
-        using size_type  = StaticIndexArray<size_seq >;
-        using self_type  = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq,size_seq>;
-        using index_type = INDEX_TYPE;
+  static constexpr camp::idx_t s_num_dims       = NUM_DIMS;
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
 
-        using nonstatic_self_type = TensorTile<INDEX_TYPE,TENSOR_SIZE,sizeof...(BeginInts)>;
 
-        using Partial = StaticTensorTile< INDEX_TYPE, TENSOR_PARTIAL, begin_seq, size_seq>; 
-        using Full    = StaticTensorTile< INDEX_TYPE, TENSOR_FULL   , begin_seq, size_seq>; 
+  template <typename I, TensorTileSize S>
+  void copy(TensorTile<I, S, NUM_DIMS> const& c)
+  {
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i)
+    {
+      m_begin[i] = c.m_begin[i];
+      m_size[i]  = c.m_size[i];
+    }
+  }
+
+  /*!
+   * Subtract begin offsets of two tiles.
+   *
+   * The resulting tile has the sizes of the left operand, but has
+   * m_begin[i] = left.m_begin[i] - right.m_begin[i]
+   *
+   */
+  template <typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type
+  operator-(TensorTile<INDEX_TYPE2, TENSOR_SIZE2, NUM_DIMS> const& sub) const
+  {
+    self_type result(*this);
+    for (camp::idx_t i = 0; i < s_num_dims; ++i)
+    {
+      result.m_begin[i] -= sub.m_begin[i];
+    }
+    return result;
+  }
 
-        begin_type m_begin;
-        size_type  m_size;
 
-	static_assert(
-          sizeof...(BeginInts) == sizeof...(SizeInts),
-          "Mismatch between number of elements in Begin and Size series of StaticTensorTile"
-        );
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("TensorTile: dims=%d, m_begin=[", (int)NUM_DIMS);
 
-        static constexpr camp::idx_t s_num_dims = sizeof...(BeginInts);
-        static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i)
+    {
+      printf("%ld ", (long)m_begin[i]);
+    }
 
-        constexpr operator nonstatic_self_type() const {
-            return nonstatic_self_type { {BeginInts...}, {SizeInts...} };
-        }
+    printf("], m_size=[");
 
-        constexpr nonstatic_self_type nonstatic() const {
-            return *this;
-        }
-        
-        template<TensorTileSize S>
-        constexpr void copy(StaticTensorTile<INDEX_TYPE, S, begin_seq, size_seq> const RAJA_UNUSED_ARG(&c)) const
-        {}
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i)
+    {
+      printf("%ld ", (long)m_size[i]);
+    }
 
+    printf("]\n");
+  }
+};
+
+
+template <typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE>
+struct StaticTensorTile;
+
+template <typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          INDEX_TYPE... BeginInts,
+          INDEX_TYPE... SizeInts>
+struct StaticTensorTile<INDEX_TYPE,
+                        TENSOR_SIZE,
+                        camp::int_seq<INDEX_TYPE, BeginInts...>,
+                        camp::int_seq<INDEX_TYPE, SizeInts...>>
+{
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("StaticTensorTile: dims=%d, m_begin=",  (int)s_num_dims);
 
-          m_begin.print();
+  using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
+  using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts...>;
+  using begin_type = StaticIndexArray<begin_seq>;
+  using size_type  = StaticIndexArray<size_seq>;
+  using self_type =
+      StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
+  using index_type = INDEX_TYPE;
 
-          printf(", m_size=");
-          
-          m_size.print();
+  using nonstatic_self_type =
+      TensorTile<INDEX_TYPE, TENSOR_SIZE, sizeof...(BeginInts)>;
 
-          printf("\n");
-        }
-    };
+  using Partial =
+      StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, begin_seq, size_seq>;
+  using Full = StaticTensorTile<INDEX_TYPE, TENSOR_FULL, begin_seq, size_seq>;
 
-        template< typename TILE, typename VALUE, size_t IDX>
-        struct SetStaticTensorTileBegin;
+  begin_type m_begin;
+  size_type m_size;
 
-        template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, INDEX_TYPE VALUE, size_t IDX > 
-        struct SetStaticTensorTileBegin<
-              StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE >,
-              camp::integral_constant<INDEX_TYPE,VALUE>,
-              IDX
-        > {
-            using BeginType = StaticIndexArray<TBEGIN>;
-            using Type = StaticTensorTile<
-                INDEX_TYPE,
-                TENSOR_SIZE,
-                typename SetStaticIndexArray<INDEX_TYPE,IDX,VALUE,BeginType>::Seq,
-                TSIZE
-            >;
-        };
+  static_assert(sizeof...(BeginInts) == sizeof...(SizeInts),
+                "Mismatch between number of elements in Begin and Size series "
+                "of StaticTensorTile");
 
-        template< typename TILE, typename VALUE, size_t IDX>
-        struct SetStaticTensorTileSize;
+  static constexpr camp::idx_t s_num_dims       = sizeof...(BeginInts);
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
 
-        template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, INDEX_TYPE VALUE, size_t IDX > 
-        struct SetStaticTensorTileSize<
-              StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE >,
-              camp::integral_constant<INDEX_TYPE,VALUE>,
-              IDX
-        > {
-            using SizeType = StaticIndexArray<TSIZE>;
-            using Type = StaticTensorTile<
-                INDEX_TYPE,
-                TENSOR_SIZE,
-                TBEGIN,
-                typename SetStaticIndexArray<INDEX_TYPE,IDX,VALUE,SizeType>::Seq
-            >;
-        };
+  constexpr operator nonstatic_self_type() const
+  {
+    return nonstatic_self_type {{BeginInts...}, {SizeInts...}};
+  }
 
+  constexpr nonstatic_self_type nonstatic() const { return *this; }
 
+  template <TensorTileSize S>
+  constexpr void copy(StaticTensorTile<INDEX_TYPE, S, begin_seq, size_seq> const
+                          RAJA_UNUSED_ARG(&c)) const
+  {}
 
 
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS, camp::idx_t STRIDE_ONE_DIM = -1>
-    struct TensorRef
-    {
-        static constexpr camp::idx_t    s_stride_one_dim = STRIDE_ONE_DIM;
-        static constexpr camp::idx_t    s_num_dims       = NUM_DIMS;
-        static constexpr TensorTileSize s_tensor_size    = TENSOR_SIZE;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("StaticTensorTile: dims=%d, m_begin=", (int)s_num_dims);
 
-        using self_type = TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, NUM_DIMS, STRIDE_ONE_DIM>;
-        using tile_type = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
-        using pointer_type = POINTER_TYPE;
-        using index_type = INDEX_TYPE;
-        
+    m_begin.print();
 
-        pointer_type m_pointer;
-        index_type m_stride[NUM_DIMS];
-        tile_type m_tile;
+    printf(", m_size=");
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("TensorRef: dims=%d, m_pointer=%p, m_stride=[", (int)NUM_DIMS, m_pointer);
+    m_size.print();
 
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_stride[i]);
-          }
+    printf("\n");
+  }
+};
 
-          printf("], stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
+template <typename TILE, typename VALUE, size_t IDX>
+struct SetStaticTensorTileBegin;
 
-          m_tile.print();
-        }
+template <typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE,
+          INDEX_TYPE VALUE,
+          size_t IDX>
+struct SetStaticTensorTileBegin<
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>,
+    camp::integral_constant<INDEX_TYPE, VALUE>,
+    IDX>
+{
+  using BeginType = StaticIndexArray<TBEGIN>;
+  using Type      = StaticTensorTile<
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      typename SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, BeginType>::Seq,
+      TSIZE>;
+};
+
+template <typename TILE, typename VALUE, size_t IDX>
+struct SetStaticTensorTileSize;
+
+template <typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE,
+          INDEX_TYPE VALUE,
+          size_t IDX>
+struct SetStaticTensorTileSize<
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>,
+    camp::integral_constant<INDEX_TYPE, VALUE>,
+    IDX>
+{
+  using SizeType = StaticIndexArray<TSIZE>;
+  using Type     = StaticTensorTile<
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      TBEGIN,
+      typename SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, SizeType>::Seq>;
+};
+
+
+template <typename POINTER_TYPE,
+          typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          camp::idx_t NUM_DIMS,
+          camp::idx_t STRIDE_ONE_DIM = -1>
+struct TensorRef
+{
+  static constexpr camp::idx_t s_stride_one_dim = STRIDE_ONE_DIM;
+  static constexpr camp::idx_t s_num_dims       = NUM_DIMS;
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
+
+  using self_type    = TensorRef<POINTER_TYPE,
+                              INDEX_TYPE,
+                              TENSOR_SIZE,
+                              NUM_DIMS,
+                              STRIDE_ONE_DIM>;
+  using tile_type    = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
+  using pointer_type = POINTER_TYPE;
+  using index_type   = INDEX_TYPE;
+
+
+  pointer_type m_pointer;
+  index_type m_stride[NUM_DIMS];
+  tile_type m_tile;
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("TensorRef: dims=%d, m_pointer=%p, m_stride=[", (int)NUM_DIMS,
+           m_pointer);
+
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i)
+    {
+      printf("%ld ", (long)m_stride[i]);
+    }
 
-    };
+    printf("], stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
+
+    m_tile.print();
+  }
+};
+
+
+template <typename POINTER_TYPE,
+          typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename STRIDE_TYPE,
+          typename BEGIN_TYPE,
+          typename SIZE_TYPE,
+          camp::idx_t STRIDE_ONE_DIM = -1>
+struct StaticTensorRef;
+
+template <typename POINTER_TYPE,
+          typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          INDEX_TYPE... StrideInts,
+          INDEX_TYPE... BeginInts,
+          INDEX_TYPE... SizeInts,
+          camp::idx_t STRIDE_ONE_DIM>
+struct StaticTensorRef<POINTER_TYPE,
+                       INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, StrideInts...>,
+                       camp::int_seq<INDEX_TYPE, BeginInts...>,
+                       camp::int_seq<INDEX_TYPE, SizeInts...>,
+                       STRIDE_ONE_DIM>
+{
 
+  static constexpr camp::idx_t s_num_dims           = sizeof...(BeginInts);
+  static constexpr camp::idx_t s_stride_one_dim     = STRIDE_ONE_DIM;
+  static constexpr TensorTileSize s_ref_tensor_size = TENSOR_SIZE;
+  using pointer_type                                = POINTER_TYPE;
+  using index_type                                  = INDEX_TYPE;
 
+  using stride_seq = camp::int_seq<INDEX_TYPE, StrideInts...>;
+  using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
+  using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts...>;
 
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename STRIDE_TYPE, typename BEGIN_TYPE, typename SIZE_TYPE, camp::idx_t STRIDE_ONE_DIM = -1>
-    struct StaticTensorRef;
+  using stride_type = StaticIndexArray<stride_seq>;
 
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, INDEX_TYPE... StrideInts, INDEX_TYPE... BeginInts, INDEX_TYPE... SizeInts, camp::idx_t STRIDE_ONE_DIM>
-    struct StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInts...>,camp::int_seq<INDEX_TYPE,BeginInts...>,camp::int_seq<INDEX_TYPE,SizeInts...>,STRIDE_ONE_DIM>
-    {
+  static_assert((sizeof...(BeginInts) == sizeof...(SizeInts)) &&
+                    (sizeof...(SizeInts) == sizeof...(StrideInts)),
+                "Mismatch between number of elements in Begin and Size series "
+                "of StaticTensorRef");
 
-        static constexpr camp::idx_t    s_num_dims         = sizeof...(BeginInts);
-        static constexpr camp::idx_t    s_stride_one_dim   = STRIDE_ONE_DIM;
-        static constexpr TensorTileSize s_ref_tensor_size  = TENSOR_SIZE;
-        using pointer_type = POINTER_TYPE;
-        using index_type = INDEX_TYPE;
-        
-        using stride_seq = camp::int_seq<INDEX_TYPE, StrideInts...>;
-        using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
-        using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts... >;
 
-        using stride_type  = StaticIndexArray<stride_seq>;
+  using self_type = StaticTensorRef<POINTER_TYPE,
+                                    INDEX_TYPE,
+                                    TENSOR_SIZE,
+                                    stride_seq,
+                                    begin_seq,
+                                    size_seq>;
+  using tile_type =
+      StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
 
-	static_assert(
-          (sizeof...(BeginInts) == sizeof...(SizeInts)) && (sizeof...(SizeInts) == sizeof...(StrideInts)),
-          "Mismatch between number of elements in Begin and Size series of StaticTensorRef"
-        );
-        
 
-        using self_type = StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,stride_seq,begin_seq,size_seq>;
-        using tile_type = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
+  pointer_type m_pointer;
+  stride_type m_stride;
+  tile_type m_tile;
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  void print() const
+  {
+    printf("StaticTensorRef: dims=%d, m_pointer=%p, m_stride=", (int)s_num_dims,
+           m_pointer);
 
-        pointer_type m_pointer;
-        stride_type m_stride;
-        tile_type m_tile;
+    m_stride.print();
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("StaticTensorRef: dims=%d, m_pointer=%p, m_stride=", (int)s_num_dims, m_pointer);
-
-          m_stride.print();
-
-          printf(", stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
-
-          m_tile.print();
-        }
-
-    };
-
-
-
-
-    template<typename REF_TYPE, typename TILE_TYPE, typename DIM_SEQ>
-    struct MergeRefTile;
-
-    template<typename REF_TYPE, typename TILE_TYPE, camp::idx_t ... DIM_SEQ>
-    struct MergeRefTile <REF_TYPE, TILE_TYPE, camp::idx_seq<DIM_SEQ...>> {
-
-        static_assert( REF_TYPE::s_num_dims == TILE_TYPE::s_num_dims , "Merging a ref with a tile requires an equivalent number of dimensions.");
-
-        static constexpr camp::idx_t    s_num_dims         = REF_TYPE::s_num_dims;
-        static constexpr camp::idx_t    s_stride_one_dim   = REF_TYPE::s_stride_one_dim;
-        static constexpr TensorTileSize s_ref_tensor_size  = TILE_TYPE::s_tensor_size;
-        using pointer_type    = typename REF_TYPE::pointer_type;
-        using ref_index_type  = typename REF_TYPE::index_type;
-        
-        static constexpr TensorTileSize s_tile_tensor_size = TILE_TYPE::s_tensor_size;
-        using tile_index_type = typename TILE_TYPE::index_type;
-
-        using merge_type = TensorRef<pointer_type, tile_index_type, s_tile_tensor_size, s_num_dims, s_stride_one_dim>;
-        using shift_type = merge_type;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        merge_type merge(REF_TYPE const &ref, TILE_TYPE const &tile){
-          return merge_type{
-            ref.m_pointer,
-            {tile_index_type(ref.m_stride[DIM_SEQ])...},
-            tile
-          };
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        shift_type shift_origin(REF_TYPE const &ref, TILE_TYPE const &tile_origin){
-          return shift_type{
-            ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ]*ref.m_stride[DIM_SEQ]) ...),
-            {tile_index_type(ref.m_stride[DIM_SEQ])...},
-            ref.m_tile
-          };
-        }
-
-    };
-
-
-
-
-
-
-
-    template<
-       typename POINTER_TYPE, typename INDEX_TYPE1, TensorTileSize RTENSOR_SIZE,
-       typename STRIDE, INDEX_TYPE1... BEGIN1, INDEX_TYPE1... SIZE1, camp::idx_t STRIDE_ONE_DIM,
-       typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE, typename BEGIN2, typename SIZE2,
-       camp::idx_t ... DIM_SEQ
-    >
-    struct MergeRefTile<
-       StaticTensorRef<
-              POINTER_TYPE, INDEX_TYPE1, RTENSOR_SIZE,
-              STRIDE,
-              camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-              camp::int_seq<INDEX_TYPE1,SIZE1...>,
-              STRIDE_ONE_DIM
-       >,
-       StaticTensorTile<
-              INDEX_TYPE2,
-              TENSOR_SIZE,
-              BEGIN2,
-              SIZE2
-       >,
-       camp::idx_seq<DIM_SEQ...>
-    > {
-
-        using ref_tile_type = StaticTensorTile<
-                  INDEX_TYPE1,
-                  RTENSOR_SIZE,
-                  camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-                  camp::int_seq<INDEX_TYPE1, SIZE1...>
-              >;
-
-        using ref_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE1,
-                  RTENSOR_SIZE,
-                  STRIDE,
-                  camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-                  camp::int_seq<INDEX_TYPE1, SIZE1...>,
-                  STRIDE_ONE_DIM
-              >;
-
-        using tile_type = StaticTensorTile<
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  BEGIN2,
-                  SIZE2
-              >;
-
-        using ref_stride_type = typename ref_type ::stride_type;
-
-        using new_stride_seq  = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(ref_stride_type::value_at(DIM_SEQ))...>; 
-        
-        using shift_begin_seq = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(BEGIN1)...>; 
-        using shift_size_seq  = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(SIZE1)...>; 
-       
-        using shift_tile_type = StaticTensorTile<INDEX_TYPE2,TENSOR_SIZE,shift_begin_seq,shift_size_seq>;
- 
-        using new_stride_type = StaticIndexArray<new_stride_seq>; 
-
-        using merge_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  new_stride_seq,
-                  BEGIN2,
-                  SIZE2,
-                  STRIDE_ONE_DIM
-              >;
-
-        using shift_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  new_stride_seq,
-                  shift_begin_seq,
-                  shift_size_seq,
-                  STRIDE_ONE_DIM
-              >;
-
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        merge_type merge(ref_type const &ref, tile_type const &tile){
-          return merge_type {
-            ref.m_pointer,
-            new_stride_type(),
-            tile
-          };
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        shift_type shift_origin(ref_type const &ref, tile_type const &tile_origin){
-          return shift_type {
-            ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ]*ref.m_stride[DIM_SEQ]) ...),
-            new_stride_type(),
-            shift_tile_type()
-          };
-        }
-
-
-
-    };
-
-
-
-
-    template<typename REF_TYPE, typename TILE_TYPE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto merge_ref_tile(REF_TYPE const &ref, TILE_TYPE const &tile) ->
-      typename MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge_type
-    {
-      return MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge(ref, tile);
-    }
+    printf(", stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
 
+    m_tile.print();
+  }
+};
 
 
-    /*!
-     * Modifies a ref's pointer so that the supplied tile_origin will resolve
-     * to the original pointer.
-     */
-    template<typename REF_TYPE, typename TILE_TYPE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto shift_tile_origin(REF_TYPE const &ref, TILE_TYPE const &tile_origin) ->
-      typename MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_type
-    {
-      return MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_origin(ref, tile_origin);
-    }
+template <typename REF_TYPE, typename TILE_TYPE, typename DIM_SEQ>
+struct MergeRefTile;
 
+template <typename REF_TYPE, typename TILE_TYPE, camp::idx_t... DIM_SEQ>
+struct MergeRefTile<REF_TYPE, TILE_TYPE, camp::idx_seq<DIM_SEQ...>>
+{
 
+  static_assert(
+      REF_TYPE::s_num_dims == TILE_TYPE::s_num_dims,
+      "Merging a ref with a tile requires an equivalent number of dimensions.");
+
+  static constexpr camp::idx_t s_num_dims       = REF_TYPE::s_num_dims;
+  static constexpr camp::idx_t s_stride_one_dim = REF_TYPE::s_stride_one_dim;
+  static constexpr TensorTileSize s_ref_tensor_size = TILE_TYPE::s_tensor_size;
+  using pointer_type   = typename REF_TYPE::pointer_type;
+  using ref_index_type = typename REF_TYPE::index_type;
+
+  static constexpr TensorTileSize s_tile_tensor_size = TILE_TYPE::s_tensor_size;
+  using tile_index_type = typename TILE_TYPE::index_type;
+
+  using merge_type = TensorRef<pointer_type,
+                               tile_index_type,
+                               s_tile_tensor_size,
+                               s_num_dims,
+                               s_stride_one_dim>;
+  using shift_type = merge_type;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr merge_type merge(REF_TYPE const& ref, TILE_TYPE const& tile)
+  {
+    return merge_type {
+        ref.m_pointer, {tile_index_type(ref.m_stride[DIM_SEQ])...}, tile};
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr shift_type shift_origin(REF_TYPE const& ref,
+                                           TILE_TYPE const& tile_origin)
+  {
+    return shift_type {
+        ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ] *
+                                                ref.m_stride[DIM_SEQ])...),
+        {tile_index_type(ref.m_stride[DIM_SEQ])...},
+        ref.m_tile};
+  }
+};
+
+
+template <typename POINTER_TYPE,
+          typename INDEX_TYPE1,
+          TensorTileSize RTENSOR_SIZE,
+          typename STRIDE,
+          INDEX_TYPE1... BEGIN1,
+          INDEX_TYPE1... SIZE1,
+          camp::idx_t STRIDE_ONE_DIM,
+          typename INDEX_TYPE2,
+          TensorTileSize TENSOR_SIZE,
+          typename BEGIN2,
+          typename SIZE2,
+          camp::idx_t... DIM_SEQ>
+struct MergeRefTile<StaticTensorRef<POINTER_TYPE,
+                                    INDEX_TYPE1,
+                                    RTENSOR_SIZE,
+                                    STRIDE,
+                                    camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                    camp::int_seq<INDEX_TYPE1, SIZE1...>,
+                                    STRIDE_ONE_DIM>,
+                    StaticTensorTile<INDEX_TYPE2, TENSOR_SIZE, BEGIN2, SIZE2>,
+                    camp::idx_seq<DIM_SEQ...>>
+{
 
-    /*!
-     * Changes TensorTile size type to FULL
-     */
-    template<typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, camp::idx_t NUM_DIMS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS> &
-    make_tensor_tile_full(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS> &tile){
-      return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS> &>(tile);
-    }
+  using ref_tile_type = StaticTensorTile<INDEX_TYPE1,
+                                         RTENSOR_SIZE,
+                                         camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                         camp::int_seq<INDEX_TYPE1, SIZE1...>>;
+
+  using ref_type = StaticTensorRef<POINTER_TYPE,
+                                   INDEX_TYPE1,
+                                   RTENSOR_SIZE,
+                                   STRIDE,
+                                   camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                   camp::int_seq<INDEX_TYPE1, SIZE1...>,
+                                   STRIDE_ONE_DIM>;
+
+  using tile_type = StaticTensorTile<INDEX_TYPE2, TENSOR_SIZE, BEGIN2, SIZE2>;
+
+  using ref_stride_type = typename ref_type ::stride_type;
+
+  using new_stride_seq =
+      camp::int_seq<INDEX_TYPE2,
+                    INDEX_TYPE2(ref_stride_type::value_at(DIM_SEQ))...>;
+
+  using shift_begin_seq = camp::int_seq<INDEX_TYPE2, INDEX_TYPE2(BEGIN1)...>;
+  using shift_size_seq  = camp::int_seq<INDEX_TYPE2, INDEX_TYPE2(SIZE1)...>;
+
+  using shift_tile_type = StaticTensorTile<INDEX_TYPE2,
+                                           TENSOR_SIZE,
+                                           shift_begin_seq,
+                                           shift_size_seq>;
+
+  using new_stride_type = StaticIndexArray<new_stride_seq>;
+
+  using merge_type = StaticTensorRef<POINTER_TYPE,
+                                     INDEX_TYPE2,
+                                     TENSOR_SIZE,
+                                     new_stride_seq,
+                                     BEGIN2,
+                                     SIZE2,
+                                     STRIDE_ONE_DIM>;
+
+  using shift_type = StaticTensorRef<POINTER_TYPE,
+                                     INDEX_TYPE2,
+                                     TENSOR_SIZE,
+                                     new_stride_seq,
+                                     shift_begin_seq,
+                                     shift_size_seq,
+                                     STRIDE_ONE_DIM>;
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr merge_type merge(ref_type const& ref, tile_type const& tile)
+  {
+    return merge_type {ref.m_pointer, new_stride_type(), tile};
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr shift_type shift_origin(ref_type const& ref,
+                                           tile_type const& tile_origin)
+  {
+    return shift_type {
+        ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ] *
+                                                ref.m_stride[DIM_SEQ])...),
+        new_stride_type(), shift_tile_type()};
+  }
+};
+
+
+template <typename REF_TYPE, typename TILE_TYPE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto
+merge_ref_tile(REF_TYPE const& ref, TILE_TYPE const& tile) ->
+    typename MergeRefTile<
+        REF_TYPE,
+        TILE_TYPE,
+        camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge_type
+{
+  return MergeRefTile<REF_TYPE, TILE_TYPE,
+                      camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge(ref,
+                                                                          tile);
+}
 
-    /*!
-     * Changes TensorTile size type to PARTIAL
-     */
-    template<typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, camp::idx_t NUM_DIMS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS> &
-    make_tensor_tile_partial(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS> &tile){
-      return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS> &>(tile);
-    }
 
+/*!
+ * Modifies a ref's pointer so that the supplied tile_origin will resolve
+ * to the original pointer.
+ */
+template <typename REF_TYPE, typename TILE_TYPE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto
+shift_tile_origin(REF_TYPE const& ref, TILE_TYPE const& tile_origin) ->
+    typename MergeRefTile<
+        REF_TYPE,
+        TILE_TYPE,
+        camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_type
+{
+  return MergeRefTile<
+      REF_TYPE, TILE_TYPE,
+      camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_origin(ref,
+                                                                 tile_origin);
+}
 
 
-    /*!
-     * Changes StaticTensorTile size type to FULL
-     */
-    template< typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE> &
-    make_tensor_tile_full(StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE> &tile){
-      return reinterpret_cast<StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE> &>(tile);
-    }
+/*!
+ * Changes TensorTile size type to FULL
+ */
+template <typename INDEX_TYPE,
+          TensorTileSize RTENSOR_SIZE,
+          camp::idx_t NUM_DIMS>
+RAJA_INLINE
+    RAJA_HOST_DEVICE constexpr TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS>&
+    make_tensor_tile_full(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS>& tile)
+{
+  return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS>&>(tile);
+}
 
-    /*!
-     * Changes StaticTensorTile size type to PARTIAL
-     */
-    template< typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE> &
-    make_tensor_tile_partial(StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE> &tile){
-      return reinterpret_cast<StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE> &>(tile);
-    }
+/*!
+ * Changes TensorTile size type to PARTIAL
+ */
+template <typename INDEX_TYPE,
+          TensorTileSize RTENSOR_SIZE,
+          camp::idx_t NUM_DIMS>
+RAJA_INLINE
+    RAJA_HOST_DEVICE constexpr TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS>&
+    make_tensor_tile_partial(
+        TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS>& tile)
+{
+  return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS>&>(
+      tile);
+}
 
 
+/*!
+ * Changes StaticTensorTile size type to FULL
+ */
+template <typename INDEX_TYPE,
+          TensorTileSize RTENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr StaticTensorTile<INDEX_TYPE,
+                                                        TENSOR_FULL,
+                                                        TBEGIN,
+                                                        TSIZE>&
+make_tensor_tile_full(
+    StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE>& tile)
+{
+  return reinterpret_cast<
+      StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE>&>(tile);
+}
+
+/*!
+ * Changes StaticTensorTile size type to PARTIAL
+ */
+template <typename INDEX_TYPE,
+          TensorTileSize RTENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr StaticTensorTile<INDEX_TYPE,
+                                                        TENSOR_PARTIAL,
+                                                        TBEGIN,
+                                                        TSIZE>&
+make_tensor_tile_partial(
+    StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE>& tile)
+{
+  return reinterpret_cast<
+      StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE>&>(tile);
+}
+
 
-  } // namespace expt
-} // namespace internal
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp b/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
index d2bce598ff..0303a1f275 100644
--- a/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
@@ -34,815 +34,795 @@ namespace expt
 {
 
 
+namespace ET
+{
+class TensorExpressionConcreteBase;
+}  // namespace ET
+
+
+template <typename TENSOR, camp::idx_t DIM>
+struct TensorDimSize
+{
+  static constexpr camp::idx_t value = TENSOR::s_dim_size(DIM);
+};
+
+/*
+ * Tensor product helper class.
+ *
+ * This defines the default product operation between types when using the
+ * operator*
+ *
+ */
+template <typename LHS, typename RHS>
+struct TensorDefaultOperation
+{
+
+  using multiply_type = decltype(LHS().multiply(RHS()));
+
+  // default multiplication operator
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static multiply_type multiply(LHS const& lhs, RHS const& rhs)
+  {
+    return lhs.multiply(rhs);
+  }
+};
+
+
+template <typename REF_TYPE>
+struct TensorRegisterStoreRef
+{
+  using self_type = TensorRegisterStoreRef<REF_TYPE>;
+  REF_TYPE m_ref;
+
+  RAJA_SUPPRESS_HD_WARN
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator=(RHS const& rhs)
+  {
+
+    rhs.store_ref(m_ref);
+    return *this;
+  }
+};
+
+template <camp::idx_t N, camp::idx_t D>
+struct DivideRoundUp
+{
+  static constexpr camp::idx_t value = (N % D) > 0 ? (1 + N / D) : (N / D);
+};
+
+
+class TensorRegisterConcreteBase
+{};
+
+/*!
+ * TensorRegister base class that provides some default behaviors and simplifies
+ * the implementation of new register types.
+ *
+ * This uses CRTP to provide static polymorphism
+ */
+template <typename Derived>
+class TensorRegisterBase;
+
+template <typename REGISTER_POLICY,
+          typename T,
+          typename LAYOUT,
+          typename camp::idx_t... SIZES>
+class TensorRegisterBase<
+    RAJA::expt::
+        TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>>
+    : public TensorRegisterConcreteBase
+{
+public:
+  using self_type = RAJA::expt::
+      TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>;
+  using element_type = camp::decay<T>;
+
+  static constexpr camp::idx_t s_num_dims = sizeof...(SIZES);
+
+  static constexpr camp::idx_t s_num_registers =
+      DivideRoundUp<RAJA::product<camp::idx_t>(SIZES...),
+                    RegisterTraits<REGISTER_POLICY, T>::s_num_elem>::value;
 
+  using index_type = camp::idx_t;
 
+  using register_type = RAJA::expt::Register<T, REGISTER_POLICY>;
 
-  namespace ET
+  using register_policy = REGISTER_POLICY;
+
+private:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type* getThis() { return static_cast<self_type*>(this); }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr self_type const* getThis() const
   {
-    class TensorExpressionConcreteBase;
-  } // namespace ET
+    return static_cast<self_type const*>(this);
+  }
+
+protected:
+  register_type m_registers[s_num_registers];
+
+public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr TensorRegisterBase() {}
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  TensorRegisterBase(element_type c) { broadcast(c); }
+
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorRegisterBase(self_type const& c) { copy(c); }
+
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  ~TensorRegisterBase() {}
 
-  template<typename TENSOR, camp::idx_t DIM>
-  struct TensorDimSize{
-      static constexpr camp::idx_t value = TENSOR::s_dim_size(DIM);
-  };
 
   /*
-   * Tensor product helper class.
-   *
-   * This defines the default product operation between types when using the
-   * operator*
-   *
+   * Overload for:    assignment of ET to a TensorRegister
    */
-  template<typename LHS, typename RHS>
-  struct TensorDefaultOperation{
+  template <typename RHS,
+            typename std::enable_if<
+                std::is_base_of<ET::TensorExpressionConcreteBase, RHS>::value,
+                bool>::type = true>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorRegisterBase(RHS const& rhs)
+  {
+    // evaluate a single tile of the ET, storing in this TensorRegister
+    *this = rhs.eval(self_type::s_get_default_tile());
+  }
 
-      using multiply_type = decltype(LHS().multiply(RHS()));
 
-      // default multiplication operator
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      multiply_type multiply(LHS const &lhs, RHS const &rhs)
-      {
-        return lhs.multiply(rhs);
-      }
+  template <typename... REGS>
+  explicit RAJA_HOST_DEVICE RAJA_INLINE TensorRegisterBase(register_type reg0,
+                                                           REGS const&... regs)
+      : m_registers {reg0, regs...}
+  {
+    static_assert(1 + sizeof...(REGS) == s_num_registers,
+                  "Incompatible number of registers");
+  }
 
-  };
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return register_type::is_root(); }
 
 
-  template<typename REF_TYPE>
-  struct TensorRegisterStoreRef{
-      using self_type = TensorRegisterStoreRef<REF_TYPE>;
-      REF_TYPE m_ref;
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr TensorRegisterStoreRef<REF_TYPE>
+  create_et_store_ref(REF_TYPE const& ref)
+  {
+    return TensorRegisterStoreRef<REF_TYPE> {ref};
+  }
 
-      RAJA_SUPPRESS_HD_WARN
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator=(RHS const &rhs)
-      {
+  RAJA_SUPPRESS_HD_WARN
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE static self_type s_load_ref(REF_TYPE const& ref)
+  {
 
-        rhs.store_ref(m_ref);
-        return *this;
-      }
-  };
+    self_type value;
 
-  template<camp::idx_t N, camp::idx_t D>
-  struct DivideRoundUp {
-      static constexpr camp::idx_t value =
-          (N % D) > 0 ? (1 + N/D) : (N/D);
-  };
+    value.load_ref(ref);
+    return value;
+  }
 
+  /*!
+   * Gets the size of the tensor
+   * Since this is a vector, just the length of the vector in dim 0
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr int s_dim_elem(int dim)
+  {
+    return (dim == 0) ? self_type::s_num_elem : 0;
+  }
 
-  class TensorRegisterConcreteBase {};
 
   /*!
-   * TensorRegister base class that provides some default behaviors and simplifies
-   * the implementation of new register types.
-   *
-   * This uses CRTP to provide static polymorphism
+   * Gets the default tile of this tensor
+   * That tile always starts at 0, and extends to the full tile sizes
+   */
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr StaticTensorTile<int,
+                                    TENSOR_FULL,
+                                    camp::int_seq<int, int(SIZES * 0)...>,
+                                    camp::int_seq<int, int(SIZES)...>>
+  s_get_default_tile()
+  {
+    return StaticTensorTile<int, TENSOR_FULL,
+                            camp::int_seq<int, int(SIZES * 0)...>,
+                            camp::int_seq<int, int(SIZES)...>>();
+  }
+
+  /*!
+   * @brief convenience routine to allow Vector classes to use
+   * camp::sink() across a variety of register types, and use things like
+   * ternary operators
    */
-  template<typename Derived>
-  class TensorRegisterBase;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr bool sink() const { return false; }
+
 
-  template<typename REGISTER_POLICY, typename T, typename LAYOUT, typename camp::idx_t ... SIZES>
-  class TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>> :
-    public TensorRegisterConcreteBase
+  /*!
+   * Copy contents of another tensor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& c)
   {
-    public:
-      using self_type = RAJA::expt::TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>;
-      using element_type = camp::decay<T>;
-
-      static constexpr camp::idx_t s_num_dims = sizeof...(SIZES);
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      m_registers[i] = c.vec(i);
+    }
+    return *getThis();
+  }
+
+
+  /*!
+   * Sets all elements to zero
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& clear()
+  {
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      m_registers[i] = register_type(0);
+    }
+
+
+    return *getThis();
+  }
+
+
+  /*!
+   * Broadcast the scalar v to all registers of this tensor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type v)
+  {
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      m_registers[i].broadcast(v);
+    }
+    return *getThis();
+  }
+
+
+  /*!
+   * @brief Broadcast scalar value to first N register elements
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast_n(element_type const& value, camp::idx_t N)
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      getThis()->set(value, i);
+    }
+    return *getThis();
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.broadcast(getThis()->get(i));
+    return x;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& mat) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].add(mat.vec(i));
+    }
+    return result;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& mat) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].subtract(mat.vec(i));
+    }
+    return result;
+  }
+
+
+  /*!
+   * element-wise multiplication
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].multiply(x.vec(i));
+    }
+    return result;
+  }
+
+  /*!
+   * element-wise fused multiply add
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply_add(self_type const& x, self_type const& add) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].multiply_add(x.vec(i), add.vec(i));
+    }
+    return result;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& mat) const
+  {
+    self_type result;
+    for (camp::idx_t reg = 0; reg < s_num_registers; ++reg)
+    {
+      result.vec(reg) = m_registers[reg].divide(mat.vec(reg));
+    }
+    return result;
+  }
+
+
+  /*!
+   * @brief Dot product of two vectors
+   * @param x Other vector to dot with this vector
+   * @return Value of (*this) dot x
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type dot(self_type const& x) const
+  {
+    element_type result(0);
+
+    for (camp::idx_t reg = 0; reg < s_num_registers; ++reg)
+    {
+      result += m_registers[reg].multiply(x.vec(reg)).sum();
+    }
+
+    return result;
+  }
+
+
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& operator=(element_type value)
+  {
+    getThis()->broadcast(value);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Set entire vector to the value held by a scalar register
+   * @param value Scalar register whose element 0 is broadcast
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template <typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const&
+  operator=(RAJA::expt::TensorRegister<RAJA::expt::scalar_register,
+                                       T2,
+                                       RAJA::expt::ScalarLayout,
+                                       camp::idx_seq<>> const& value)
+  {
+    getThis()->broadcast(value.get(0));
+    return *getThis();
+  }
+
+  /*!
+   * @brief Assign one register to another
+   * @param x Vector to copy
+   * @return Value of (*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& operator=(self_type const& x)
+  {
+    getThis()->copy(x);
+    return *getThis();
+  }
+
+
+  /*!
+   * @brief Add two vector registers
+   * @param x Vector to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator+(self_type const& x) const { return getThis()->add(x); }
+
 
-      static constexpr camp::idx_t s_num_registers = DivideRoundUp<RAJA::product<camp::idx_t>(SIZES...), RegisterTraits<REGISTER_POLICY,T>::s_num_elem>::value;
-
-      using index_type = camp::idx_t;
-
-      using register_type = RAJA::expt::Register<T, REGISTER_POLICY>;
-
-      using register_policy = REGISTER_POLICY;
-
-    private:
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type *getThis(){
-        return static_cast<self_type *>(this);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      self_type const *getThis() const{
-        return static_cast<self_type const *>(this);
-      }
-
-    protected:
-
-      register_type m_registers[s_num_registers];
-
-    public:
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegisterBase(){}
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegisterBase(element_type c)
-      {
-        broadcast(c);
-      }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegisterBase(self_type const &c)
-      {
-        copy(c);
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~TensorRegisterBase(){}
-
-
-      /*
-       * Overload for:    assignment of ET to a TensorRegister
-       */
-      template<typename RHS,
-        typename std::enable_if<std::is_base_of<ET::TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegisterBase(RHS const &rhs)
-      {
-        // evaluate a single tile of the ET, storing in this TensorRegister
-        *this = rhs.eval(self_type::s_get_default_tile());
-      }
-
-
-      template<typename ... REGS>
-      explicit
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegisterBase(register_type reg0, REGS const &... regs) :
-        m_registers{reg0, regs...}
-      {
-        static_assert(1+sizeof...(REGS) == s_num_registers,
-            "Incompatible number of registers");
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return register_type::is_root();
-      }
-
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      TensorRegisterStoreRef<REF_TYPE>
-      create_et_store_ref(REF_TYPE const &ref) {
-        return TensorRegisterStoreRef<REF_TYPE>{ref};
-      }
-
-      RAJA_SUPPRESS_HD_WARN
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      self_type
-      s_load_ref(REF_TYPE const &ref) {
-
-        self_type value;
-
-        value.load_ref(ref);
-        return value;
-      }
-
-      /*!
-       * Gets the size of the tensor
-       * Since this is a vector, just the length of the vector in dim 0
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr int s_dim_elem(int dim){
-        return (dim==0) ? self_type::s_num_elem : 0;
-      }
-
-
-      /*!
-       * Gets the default tile of this tensor
-       * That tile always start at 0, and extends to the full tile sizes
-       */
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr StaticTensorTile<int, TENSOR_FULL, camp::int_seq<int,int(SIZES*0)...>, camp::int_seq<int,int(SIZES)...>>
-      s_get_default_tile()
-      {
-        return StaticTensorTile<int, TENSOR_FULL, camp::int_seq<int,int(SIZES*0)...>, camp::int_seq<int,int(SIZES)...>>();
-      }
-
-      /*!
-       * @brief convenience routine to allow Vector classes to use
-       * camp::sink() across a variety of register types, and use things like
-       * ternary operators
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      bool sink() const{
-        return false;
-      }
-
-
-
-
-
-
-      /*!
-       * Copy contents of another tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &c){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i] = c.vec(i);
-        }
-        return *getThis();
-      }
-
-
-
-
-      /*!
-       * Sets all elements to zero
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &clear(){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i] = register_type(0);
-        }
-
-
-        return *getThis();
-      }
-
-
-      /*!
-       * Copy contents of another matrix operator
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type v){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i].broadcast(v);
-        }
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Broadcast scalar value to first N register elements
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast_n(element_type const &value, camp::idx_t N){
-        for(camp::idx_t i = 0;i < N;++ i){
-          getThis()->set(value, i);
-        }
-        return *getThis();
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.broadcast(getThis()->get(i));
-        return x;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].add(mat.vec(i));
-        }
-        return result;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].subtract(mat.vec(i));
-        }
-        return result;
-      }
-
-
-      /*!
-       * element-wise multiplication
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].multiply(x.vec(i));
-        }
-        return result;
-      }
-
-      /*!
-       * element-wise fused multiply add
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply_add(self_type const &x, self_type const &add) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].multiply_add(x.vec(i), add.vec(i));
-        }
-        return result;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-          result.vec(reg) = m_registers[reg].divide(mat.vec(reg));
-        }
-        return result;
-      }
-
-
-
-      /*!
-       * @brief Dot product of two vectors
-       * @param x Other vector to dot with this vector
-       * @return Value of (*this) dot x
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type dot(self_type const &x) const
-      {
-        element_type result(0);
-
-        for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-          result += m_registers[reg].multiply(x.vec(reg)).sum();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(element_type value)
-      {
-        getThis()->broadcast(value);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(RAJA::expt::TensorRegister<RAJA::expt::scalar_register, T2, RAJA::expt::ScalarLayout, camp::idx_seq<>> const &value)
-      {
-        getThis()->broadcast(value.get(0));
-        return *getThis();
-      }
-
-      /*!
-       * @brief Assign one register to antoher
-       * @param x Vector to copy
-       * @return Value of (*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(self_type const &x)
-      {
-        getThis()->copy(x);
-        return *getThis();
-      }
-
-
-
-
-
-      /*!
-       * @brief Add two vector registers
-       * @param x Vector to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(self_type const &x) const
-      {
-        return getThis()->add(x);
-      }
-
-
-      /*!
-       * @brief Add a vector to this vector
-       * @param x Vector to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(self_type const &x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Add vector to a scalar
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(element_type const &x) const
-      {
-        return getThis()->add(x);
-      }
-
-
-      /*!
-       * @brief Add a scalar to this vector
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(element_type x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Negate the value of this vector
-       * @return Value of -(*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-() const
-      {
-        return self_type(0).subtract(*getThis());
-      }
-
-      /*!
-       * @brief Subtract two vector registers
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(self_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
-
-      /*!
-       * @brief Subtract a vector from this vector
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(self_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Subtract scalar from this register
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(element_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
-
-      /*!
-       * @brief Subtract a scalar from this vector
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(element_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Multiply two vector registers, element wise
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
+  /*!
+   * @brief Add a vector to this vector
+   * @param x Vector to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator+=(self_type const& x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Add vector to a scalar
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator+(element_type const& x) const { return getThis()->add(x); }
+
+
+  /*!
+   * @brief Add a scalar to this vector
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator+=(element_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Negate the value of this vector
+   * @return Value of -(*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-() const { return self_type(0).subtract(*getThis()); }
+
+  /*!
+   * @brief Subtract two vector registers
+   * @param x Vector to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-(self_type const& x) const
+  {
+    return getThis()->subtract(x);
+  }
+
+  /*!
+   * @brief Subtract a vector from this vector
+   * @param x Vector to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator-=(self_type const& x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Subtract scalar from this register
+   * @param x Scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator-(element_type const& x) const
+  {
+    return getThis()->subtract(x);
+  }
+
+  /*!
+   * @brief Subtract a scalar from this vector
+   * @param x Scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator-=(element_type const& x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Multiply two vector registers, element wise
+   * @param rhs Operand to multiply with this register
+   * @return Value of (*this)*rhs
+   */
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE
       typename TensorDefaultOperation<self_type, RHS>::multiply_type
-      operator*(RHS const &rhs) const
-      {
-        return TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
-      }
-
-      /*!
-       * @brief Multiply a vector with this vector
-       * @param x Vector to multiple with this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator*=(RHS const &rhs)
-      {
-        *getThis() = TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Divide two vector registers, element wise
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(self_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
-
-      /*!
-       * @brief Divide this vector by another vector
-       * @param x Vector to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(self_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Divide by a scalar, element wise
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(element_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
-
-      /*!
-       * @brief Divide this vector by another vector
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(element_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Returns element wise minimum value tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmin(self_type x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].vmin(x.vec(i));
-        }
-        return result;
-      }
-
-
-      /*!
-       * @brief Returns element wise maximum value tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmax(self_type x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].vmax(x.vec(i));
-        }
-        return result;
-      }
-
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type &vec(int i){
-        return m_registers[i];
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      register_type const &vec(int i) const{
-        return m_registers[i];
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type &get_register(int reg){
-        return m_registers[reg];
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      register_type const &get_register(int reg) const{
-        return m_registers[reg];
-      }
-
-
-
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return getThis()->multiply_add(b, -c);
-      }
-
-      /*!
-       * Multiply this tensor by a scalar value
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type scale(element_type c) const
-      {
-        return getThis()->multiply(self_type(c));
-      }
-
-
-      /*!
-       * In-place add operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_add(self_type x){
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place sbutract operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_subtract(self_type x){
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply(self_type x){
-        *getThis() = getThis()->multiply(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply-add operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply_add(self_type x, self_type y){
-        *getThis() = getThis()->multiply_add(x,y);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply-subtract operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply_subtract(self_type x, self_type y){
-        *getThis() = getThis()->multiply_subtract(x,y);
-        return *getThis();
-      }
-
-      /*!
-       * In-place divide operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_divide(self_type x){
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place scaling operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_scale(element_type x){
-        *getThis() = getThis()->scale(x);
-        return *getThis();
-      }
-
-  };
-
-} //namespace internal
-
-} // namespace expt
+      operator*(RHS const& rhs) const
+  {
+    return TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
+  }
+
+  /*!
+   * @brief Multiply a vector with this vector
+   * @param rhs Operand to multiply with this register
+   * @return Value of (*this)*rhs
+   */
+  template <typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator*=(RHS const& rhs)
+  {
+    *getThis() =
+        TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Divide two vector registers, element wise
+   * @param x Vector to divide this register by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type operator/(self_type const& x) const { return getThis()->divide(x); }
+
+  /*!
+   * @brief Divide this vector by another vector
+   * @param x Vector to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator/=(self_type const& x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
+
+
+  /*!
+   * @brief Divide by a scalar, element wise
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type operator/(element_type const& x) const
+  {
+    return getThis()->divide(x);
+  }
+
+  /*!
+   * @brief Divide this vector by a scalar
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator/=(element_type const& x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
+
+
+  /*!
+   * @brief Returns element wise minimum value tensor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type vmin(self_type x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].vmin(x.vec(i));
+    }
+    return result;
+  }
+
+
+  /*!
+   * @brief Returns element wise maximum value tensor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type vmax(self_type x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].vmax(x.vec(i));
+    }
+    return result;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  register_type& vec(int i) { return m_registers[i]; }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr register_type const& vec(int i) const { return m_registers[i]; }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  register_type& get_register(int reg) { return m_registers[reg]; }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr register_type const& get_register(int reg) const
+  {
+    return m_registers[reg];
+  }
 
-}  // namespace RAJA
 
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
+   *
+   * Derived types can override this to implement intrinsic FMS's
+   *
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return getThis()->multiply_add(b, -c);
+  }
+
+  /*!
+   * Multiply this tensor by a scalar value
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type scale(element_type c) const
+  {
+    return getThis()->multiply(self_type(c));
+  }
+
+
+  /*!
+   * In-place add operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& inplace_add(self_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
+
+  /*!
+   * In-place subtract operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& inplace_subtract(self_type x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
+
+  /*!
+   * In-place multiply operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& inplace_multiply(self_type x)
+  {
+    *getThis() = getThis()->multiply(x);
+    return *getThis();
+  }
+
+  /*!
+   * In-place multiply-add operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& inplace_multiply_add(self_type x, self_type y)
+  {
+    *getThis() = getThis()->multiply_add(x, y);
+    return *getThis();
+  }
+
+  /*!
+   * In-place multiply-subtract operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& inplace_multiply_subtract(self_type x, self_type y)
+  {
+    *getThis() = getThis()->multiply_subtract(x, y);
+    return *getThis();
+  }
+
+  /*!
+   * In-place divide operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& inplace_divide(self_type x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
+
+  /*!
+   * In-place scaling operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& inplace_scale(element_type x)
+  {
+    *getThis() = getThis()->scale(x);
+    return *getThis();
+  }
+};
+
+}  // namespace expt
+
+}  // namespace internal
+
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp b/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
index 3899a97118..9a0d011d7e 100644
--- a/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
@@ -33,345 +33,351 @@ namespace expt
 {
 
 
+template <typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
+struct StaticTensorTileExec;
 
-    template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
-    struct StaticTensorTileExec;
+template <typename STORAGE, typename DIM_SEQ>
+struct TensorTileExec;
 
-    template<typename STORAGE, typename DIM_SEQ>
-    struct TensorTileExec;
-
-    /**
-     * Implement a dimension tiling loop
-     */
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t ... DIM_REST>
-    struct TensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>>{
+/**
+ * Implement a dimension tiling loop
+ */
+template <typename STORAGE, camp::idx_t DIM0, camp::idx_t... DIM_REST>
+struct TensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>>
+{
 
-      using inner_t = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
+  using inner_t = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
 
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void exec(OTILE const &otile, TTYPE &tile, BODY && body){
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void
+  exec(OTILE const& otile, TTYPE& tile, BODY&& body)
+  {
 
-        auto const orig_begin = otile.m_begin[DIM0];
-        auto const orig_size =  otile.m_size[DIM0];
+    auto const orig_begin = otile.m_begin[DIM0];
+    auto const orig_size  = otile.m_size[DIM0];
 
-        // Do the full tile sizes
-        for(tile.m_begin[DIM0] = orig_begin;
+    // Do the full tile sizes
+    for (tile.m_begin[DIM0] = orig_begin;
 
-            tile.m_begin[DIM0] +  STORAGE::s_dim_elem(DIM0) <=
-                orig_begin+orig_size;
+         tile.m_begin[DIM0] + STORAGE::s_dim_elem(DIM0) <=
+         orig_begin + orig_size;
 
-            tile.m_begin[DIM0] += STORAGE::s_dim_elem(DIM0)){
+         tile.m_begin[DIM0] += STORAGE::s_dim_elem(DIM0))
+    {
 
-          // Do the next inner tiling loop
-          inner_t::exec(otile, tile, body);
+      // Do the next inner tiling loop
+      inner_t::exec(otile, tile, body);
+    }
 
-        }
+    // Postamble if needed
+    if (tile.m_begin[DIM0] < orig_begin + orig_size)
+    {
 
-        // Postamble if needed
-        if(tile.m_begin[DIM0] <
-            orig_begin + orig_size)
-        {
+      // convert tile to a partial tile
+      auto& part_tile = make_tensor_tile_partial(tile);
 
-          // convert tile to a partial tile
-          auto &part_tile = make_tensor_tile_partial(tile);
+      // store original size
+      auto tmp_size = part_tile.m_size[DIM0];
 
-          // store original size
-          auto tmp_size = part_tile.m_size[DIM0];
+      // set tile size to the remainder
+      part_tile.m_size[DIM0] = orig_begin + orig_size - tile.m_begin[DIM0];
 
-          // set tile size to the remainder
-          part_tile.m_size[DIM0] =
-              orig_begin +
-              orig_size -
-              tile.m_begin[DIM0];
+      // Do the next inner tiling loop
+      inner_t::exec(otile, part_tile, body);
 
-          // Do the next inner tiling loop
-          inner_t::exec(otile, part_tile, body);
+      // restore size
+      part_tile.m_size[DIM0] = tmp_size;
+    }
 
-          // restore size
-          part_tile.m_size[DIM0] = tmp_size;
-        }
+    // reset tile dimension
+    tile.m_begin[DIM0] = orig_begin;
+  }
 
-        // reset tile dimension
-        tile.m_begin[DIM0] = orig_begin;
 
-      }
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void
+  static_exec(OTILE const& otile, TTYPE const& tile, BODY&& body)
+  {
 
 
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size  = OTILE::size_type::value_at(DIM0);
 
-      template<
-          typename OTILE,
-          typename TTYPE,
-          typename BODY
-      >
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void
-      static_exec(
-          OTILE const &otile,
-          TTYPE const &tile,
-          BODY && body
-      ){
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
 
+    auto constexpr step_size = STORAGE::s_dim_elem(DIM0);
 
-        auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-        auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
+    auto constexpr iter_count =
+        (tile_begin >= orig_begin) && (tile_begin < (orig_begin + orig_size))
+            ? ((orig_begin + orig_size) - tile_begin + step_size - 1) /
+                  step_size
+            : 0;
 
-        auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
 
-        auto constexpr step_size  = STORAGE::s_dim_elem(DIM0);
+    using IterCount =
+        camp::integral_constant<typename TTYPE::index_type, iter_count>;
+    using DimSeq = camp::idx_seq<DIM0, DIM_REST...>;
+    using IdxSeq = typename camp::detail::gen_seq<typename TTYPE::index_type,
+                                                  IterCount>::type;
 
-        auto constexpr iter_count =
-               (tile_begin >= orig_begin) && (tile_begin < (orig_begin+orig_size))
-                 ? ((orig_begin + orig_size) - tile_begin + step_size - 1) / step_size
-                 : 0;
+    StaticTensorTileExec<STORAGE, DimSeq, IdxSeq>::exec(otile, tile, body);
+  }
+};
 
 
-        using IterCount = camp::integral_constant<typename TTYPE::index_type,iter_count>;
-        using DimSeq = camp::idx_seq<DIM0,DIM_REST...>;
-        using IdxSeq = typename camp::detail::gen_seq<typename TTYPE::index_type,IterCount>::type;
+/**
+ * Termination of nested loop:  execute evaluation of ET
+ */
+template <typename STORAGE>
+struct TensorTileExec<STORAGE, camp::idx_seq<>>
+{
 
-        StaticTensorTileExec<STORAGE,DimSeq,IdxSeq>::exec(otile,tile,body);
-        
-      }
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void
+  exec(OTILE&, TTYPE const& tile, BODY&& body)
+  {
+
+    // execute body, passing in the current tile
+    body(tile);
+  }
+
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void
+  static_exec(OTILE const&, TTYPE const& tile, BODY&& body)
+  {
+
+    // execute body, passing in the current tile
+    body(tile);
+  }
+};
+
+
+template <typename STORAGE,
+          typename TILE_TYPE,
+          typename BODY,
+          camp::idx_t... IDX_SEQ,
+          camp::idx_t... DIM_SEQ>
+RAJA_INLINE RAJA_HOST_DEVICE void
+tensorTileExec_expanded(TILE_TYPE const& orig_tile,
+                        BODY&& body,
+                        camp::idx_seq<IDX_SEQ...> const&,
+                        camp::idx_seq<DIM_SEQ...> const&)
+{
 
+  // tile over full rows and columns
+  // tile_type tile{{0,0},{row_tile_size, col_tile_size}};
+  TILE_TYPE tile {
+      {orig_tile.m_begin[IDX_SEQ]...},
+      {STORAGE::s_dim_elem(IDX_SEQ)...},
+  };
 
 
-    };
+  // Promote the tile type to a "full-tile" so that the full-element
+  // register operations are used.
+  // Any of the tiling loops can demote this to a partial-tile when
+  // they do postamble execution
+  auto& full_tile = make_tensor_tile_full(tile);
 
+  // Do all of the tiling loops in layout order, this may improve
+  // cache performance
+  using layout_order       = typename STORAGE::layout_type::seq_t;
+  using tensor_tile_exec_t = TensorTileExec<STORAGE, layout_order>;
 
-    /**
-     * Termination of nested loop:  execute evaluation of ET
-     */
-    template<typename STORAGE>
-    struct TensorTileExec<STORAGE, camp::idx_seq<>>{
 
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void exec(OTILE &, TTYPE const &tile, BODY && body){
+  tensor_tile_exec_t::exec(orig_tile, full_tile, body);
+}
 
-        // execute body, passing in the current tile
-        body(tile);
 
-      }
+template <typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
+struct StaticTensorTileExec;
 
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void static_exec(OTILE const &, TTYPE const &tile, BODY && body){
+/**
+ * Implement a dimension tiling loop
+ */
 
-        // execute body, passing in the current tile
-        body(tile);
+template <typename STORAGE,
+          camp::idx_t DIM0,
+          camp::idx_t... DIM_REST,
+          camp::idx_t IDX,
+          camp::idx_t... IDX_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM0, DIM_REST...>,
+                            camp::idx_seq<IDX, IDX_REST...>>
+{
 
-      }
+  using DimList = camp::idx_seq<DIM0, DIM_REST...>;
+  using DimTail = camp::idx_seq<DIM_REST...>;
+  using IdxList = camp::idx_seq<IDX, IDX_REST...>;
+  using IdxTail = camp::idx_seq<IDX_REST...>;
 
-    };
+  using DownExec = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
+  using NextExec = StaticTensorTileExec<STORAGE,
+                                        camp::idx_seq<DIM0, DIM_REST...>,
+                                        camp::idx_seq<IDX_REST...>>;
 
+  static auto const step_size = STORAGE::s_dim_elem(DIM0);
 
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void
+  exec(OTILE const& otile, TTYPE const& tile, BODY&& body)
+  {
 
-    template<typename STORAGE, typename TILE_TYPE, typename BODY, camp::idx_t ... IDX_SEQ, camp::idx_t ... DIM_SEQ>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec_expanded(TILE_TYPE const &orig_tile, BODY && body, camp::idx_seq<IDX_SEQ...> const &, camp::idx_seq<DIM_SEQ...> const &)
-    {
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size  = OTILE::size_type::value_at(DIM0);
 
-      // tile over full rows and columns
-      // tile_type tile{{0,0},{row_tile_size, col_tile_size}};
-      TILE_TYPE tile {
-        {orig_tile.m_begin[IDX_SEQ]...},
-        {STORAGE::s_dim_elem(IDX_SEQ)...},
-      };
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
 
+    using NextBegin =
+        camp::integral_constant<typename TTYPE::index_type,
+                                tile_begin + STORAGE::s_dim_elem(DIM0)>;
+    using TailSize =
+        camp::integral_constant<typename TTYPE::index_type,
+                                (orig_begin + orig_size) - tile_begin>;
 
-      // Promote the tile type to a "full-tile" so that the full-element
-      // register operations are used.
-      // Any of the tiling loops can demote this to a partial-tile when
-      // they do postamble execution
-      auto &full_tile = make_tensor_tile_full(tile);
+    using NextTile =
+        typename expt::SetStaticTensorTileBegin<TTYPE, NextBegin,
+                                                (size_t)DIM0>::Type;
 
-      // Do all of the tiling loops in layout order, this may improve
-      // cache performance
-      using layout_order = typename STORAGE::layout_type::seq_t;
-      using tensor_tile_exec_t =
-             TensorTileExec<STORAGE, layout_order>;
+    using TailTile = typename expt::SetStaticTensorTileSize<TTYPE, TailSize,
+                                                            (size_t)DIM0>::Type;
+    using PartTile = typename TailTile::Partial;
 
 
-      tensor_tile_exec_t::exec(orig_tile, full_tile, body);
+    static_assert((tile_begin + STORAGE::s_dim_elem(DIM0)) <=
+                      (orig_begin + orig_size + STORAGE::s_dim_elem(DIM0)),
+                  "OOB StaticTensorTileExec DOWN");
 
+    if ((tile_begin + STORAGE::s_dim_elem(DIM0)) <= (orig_begin + orig_size))
+    {
+      DownExec::static_exec(otile, tile, body);
+      NextTile next_tile;
+      NextExec::exec(otile, next_tile, body);
     }
+    else if (tile_begin < (orig_begin + orig_size))
+    {
+      PartTile part_tile;
+      DownExec::static_exec(otile, part_tile, body);
+    }
+  }
+};
 
 
-    template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
-    struct StaticTensorTileExec;
-
-    /**
-     * Implement a dimension tiling loop
-     */
-
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t ... DIM_REST, camp::idx_t IDX, camp::idx_t ... IDX_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>,camp::idx_seq<IDX,IDX_REST...>>{
-
-          using DimList  = camp::idx_seq<DIM0, DIM_REST...>;
-          using DimTail  = camp::idx_seq<      DIM_REST...>;
-          using IdxList  = camp::idx_seq<IDX , IDX_REST...>;
-          using IdxTail  = camp::idx_seq<      IDX_REST...>;
-
-          using DownExec = TensorTileExec<STORAGE,camp::idx_seq<DIM_REST...>>;
-          using NextExec = StaticTensorTileExec<STORAGE,camp::idx_seq<DIM0,DIM_REST...>,camp::idx_seq<IDX_REST...>>;
-
-          static auto const step_size = STORAGE::s_dim_elem(DIM0);
-
-          template<
-              typename OTILE,
-              typename TTYPE,
-              typename BODY
-          >
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static
-          void
-          exec(
-              OTILE const &otile,
-              TTYPE const &tile,
-              BODY && body
-          ){
-    
-            auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-            auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
-    
-            auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
-
-            using NextBegin = camp::integral_constant<typename TTYPE::index_type,tile_begin+STORAGE::s_dim_elem(DIM0)>;
-            using TailSize  = camp::integral_constant<typename TTYPE::index_type,(orig_begin+orig_size)-tile_begin>;
-
-            using NextTile  = typename expt::SetStaticTensorTileBegin<TTYPE,NextBegin,(size_t)DIM0>::Type;
-
-            using TailTile  = typename expt::SetStaticTensorTileSize <TTYPE,TailSize ,(size_t)DIM0>::Type;
-            using PartTile  = typename TailTile::Partial;
-
-    
-            static_assert( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size+ STORAGE::s_dim_elem(DIM0) ), "OOB StaticTensorTileExec DOWN" );
-     
-            if( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size) ){
-               DownExec::static_exec(otile, tile, body);
-               NextTile next_tile;
-               NextExec::exec(otile, next_tile, body);
-            } else if ( tile_begin < (orig_begin + orig_size ) ) {
-               PartTile part_tile;
-               DownExec::static_exec(otile,part_tile,body);
-            }
-    
-          }
-
-
-
-    };
-
-
+template <typename STORAGE,
+          camp::idx_t DIM0,
+          camp::idx_t IDX,
+          camp::idx_t... IDX_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM0>,
+                            camp::idx_seq<IDX, IDX_REST...>>
+{
+  using NextExec = StaticTensorTileExec<STORAGE,
+                                        camp::idx_seq<DIM0>,
+                                        camp::idx_seq<IDX_REST...>>;
 
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t IDX, camp::idx_t ... IDX_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM0>, camp::idx_seq<IDX,IDX_REST...>>{
-      using NextExec = StaticTensorTileExec<STORAGE,camp::idx_seq<DIM0>,camp::idx_seq<IDX_REST...>>;
 
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void
+  exec(OTILE const& otile, TTYPE const& tile, BODY&& body)
+  {
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size  = OTILE::size_type::value_at(DIM0);
 
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static void exec(OTILE const & otile, TTYPE const &tile, BODY && body) {
-            auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-            auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
-    
-            auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
 
-            using NextBegin = camp::integral_constant<typename TTYPE::index_type,tile_begin+STORAGE::s_dim_elem(DIM0)>;
-            using TailSize  = camp::integral_constant<typename TTYPE::index_type,(orig_begin+orig_size)-tile_begin>;
+    using NextBegin =
+        camp::integral_constant<typename TTYPE::index_type,
+                                tile_begin + STORAGE::s_dim_elem(DIM0)>;
+    using TailSize =
+        camp::integral_constant<typename TTYPE::index_type,
+                                (orig_begin + orig_size) - tile_begin>;
 
-            using NextTile  = typename expt::SetStaticTensorTileBegin<TTYPE,NextBegin,(size_t)DIM0>::Type;
+    using NextTile =
+        typename expt::SetStaticTensorTileBegin<TTYPE, NextBegin,
+                                                (size_t)DIM0>::Type;
 
-            using TailTile  = typename expt::SetStaticTensorTileSize <TTYPE,TailSize ,(size_t)DIM0>::Type;
-            using PartTile  = typename TailTile::Partial;
+    using TailTile = typename expt::SetStaticTensorTileSize<TTYPE, TailSize,
+                                                            (size_t)DIM0>::Type;
+    using PartTile = typename TailTile::Partial;
 
-    
-            static_assert( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size+ STORAGE::s_dim_elem(DIM0) ), "OOB StaticTensorTileExec ACROSS" );
-     
-            if( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size) ){
-               body(tile);
-               NextTile next_tile;
-               NextExec::exec(otile, next_tile, body);
-            } else if ( tile_begin < (orig_begin + orig_size ) ) {
-               PartTile part_tile;
-               body(part_tile);
-            }
-      }
 
-    };
-
-    template<typename STORAGE, camp::idx_t ... DIM_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>, camp::idx_seq<> >{
-
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static void exec(OTILE const &, TTYPE const &, BODY &&) {}
-
-    };
+    static_assert((tile_begin + STORAGE::s_dim_elem(DIM0)) <=
+                      (orig_begin + orig_size + STORAGE::s_dim_elem(DIM0)),
+                  "OOB StaticTensorTileExec ACROSS");
 
-
-
-    template<typename STORAGE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, typename BODY, camp::idx_t ... IDX_SEQ, camp::idx_t ... DIM_SEQ>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec_expanded( StaticTensorTile<INDEX_TYPE,TENSOR_SIZE, TBEGIN, TSIZE> const &orig_tile, BODY && body, camp::idx_seq<IDX_SEQ...> const &, camp::idx_seq<DIM_SEQ...> const &)
+    if ((tile_begin + STORAGE::s_dim_elem(DIM0)) <= (orig_begin + orig_size))
     {
+      body(tile);
+      NextTile next_tile;
+      NextExec::exec(otile, next_tile, body);
+    }
+    else if (tile_begin < (orig_begin + orig_size))
+    {
+      PartTile part_tile;
+      body(part_tile);
+    }
+  }
+};
 
-      using InputType = StaticTensorTile<
-          INDEX_TYPE,
-          TENSOR_SIZE,
-          TBEGIN,
-          TSIZE
-      >;
+template <typename STORAGE, camp::idx_t... DIM_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM_REST...>,
+                            camp::idx_seq<>>
+{
 
-      using InputBegin = typename InputType::begin_type;
+  template <typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void
+  exec(OTILE const&, TTYPE const&, BODY&&)
+  {}
+};
+
+
+template <typename STORAGE,
+          typename INDEX_TYPE,
+          TensorTileSize TENSOR_SIZE,
+          typename TBEGIN,
+          typename TSIZE,
+          typename BODY,
+          camp::idx_t... IDX_SEQ,
+          camp::idx_t... DIM_SEQ>
+RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec_expanded(
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE> const& orig_tile,
+    BODY&& body,
+    camp::idx_seq<IDX_SEQ...> const&,
+    camp::idx_seq<DIM_SEQ...> const&)
+{
 
-      using Type = StaticTensorTile<
-          INDEX_TYPE,
-          TENSOR_FULL,
-          camp::int_seq<INDEX_TYPE,InputBegin::value_at(IDX_SEQ)...>,
-          camp::int_seq<INDEX_TYPE,STORAGE::s_dim_elem(IDX_SEQ)...>
-      >;
+  using InputType = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>;
 
-      Type full_tile;
+  using InputBegin = typename InputType::begin_type;
 
-      // Do all of the tiling loops in layout order, this may improve
-      // cache performance
-      using layout_order = typename STORAGE::layout_type::seq_t;
-      using tensor_tile_exec_t =
-             TensorTileExec<STORAGE, layout_order>;
+  using Type = StaticTensorTile<
+      INDEX_TYPE, TENSOR_FULL,
+      camp::int_seq<INDEX_TYPE, InputBegin::value_at(IDX_SEQ)...>,
+      camp::int_seq<INDEX_TYPE, STORAGE::s_dim_elem(IDX_SEQ)...>>;
 
+  Type full_tile;
 
-      tensor_tile_exec_t::static_exec(orig_tile, full_tile, body);
+  // Do all of the tiling loops in layout order, this may improve
+  // cache performance
+  using layout_order       = typename STORAGE::layout_type::seq_t;
+  using tensor_tile_exec_t = TensorTileExec<STORAGE, layout_order>;
 
-    }
 
+  tensor_tile_exec_t::static_exec(orig_tile, full_tile, body);
+}
 
 
-    template<typename STORAGE, typename TILE_TYPE, typename BODY>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec(TILE_TYPE const &tile, BODY && body)
-    {
-      using layout_type = typename STORAGE::layout_type;
-      tensorTileExec_expanded<STORAGE>(tile, body, camp::make_idx_seq_t<STORAGE::s_num_dims>{}, layout_type{});
-    }
+template <typename STORAGE, typename TILE_TYPE, typename BODY>
+RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec(TILE_TYPE const& tile,
+                                                 BODY&& body)
+{
+  using layout_type = typename STORAGE::layout_type;
+  tensorTileExec_expanded<STORAGE>(
+      tile, body, camp::make_idx_seq_t<STORAGE::s_num_dims> {}, layout_type {});
+}
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp b/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
index 4ef4998fbe..dfce569070 100644
--- a/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
+++ b/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
@@ -34,958 +34,1064 @@ namespace RAJA
 namespace expt
 {
 
-  /*!
-   * This provides a Tensor specialization for vectors
-   */
-  template<typename REGISTER_POLICY, typename T, camp::idx_t SIZE>
-  class TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>> :
-    public internal::expt::TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>>
+/*!
+ * This provides a Tensor specialization for vectors
+ */
+template <typename REGISTER_POLICY, typename T, camp::idx_t SIZE>
+class TensorRegister<REGISTER_POLICY,
+                     T,
+                     RAJA::expt::VectorLayout,
+                     camp::idx_seq<SIZE>>
+    : public internal::expt::TensorRegisterBase<
+          RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                     T,
+                                     RAJA::expt::VectorLayout,
+                                     camp::idx_seq<SIZE>>>
+{
+public:
+  using self_type = TensorRegister<REGISTER_POLICY,
+                                   T,
+                                   RAJA::expt::VectorLayout,
+                                   camp::idx_seq<SIZE>>;
+  using base_type = internal::expt::TensorRegisterBase<
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::VectorLayout,
+                                 camp::idx_seq<SIZE>>>;
+  using element_type  = camp::decay<T>;
+  using layout_type   = TensorLayout<0>;
+  using register_type = Register<T, REGISTER_POLICY>;
+
+  static constexpr camp::idx_t s_num_elem = SIZE;
+
+  using int_element_type =
+      typename register_type::int_vector_type::element_type;
+  using int_vector_type = TensorRegister<REGISTER_POLICY,
+                                         int_element_type,
+                                         RAJA::expt::VectorLayout,
+                                         camp::idx_seq<SIZE>>;
+
+private:
+  static constexpr camp::idx_t s_register_num_elem = register_type::s_num_elem;
+
+  static constexpr camp::idx_t s_num_full_registers =
+      s_num_elem / s_register_num_elem;
+
+  static constexpr camp::idx_t s_num_partial_lanes =
+      s_num_elem % s_register_num_elem;
+
+  static constexpr camp::idx_t s_num_registers = (s_num_partial_lanes > 0)
+                                                     ? s_num_full_registers + 1
+                                                     : s_num_full_registers;
+
+  using log_base2_t = RAJA::LogBase2<s_register_num_elem>;
+
+  static constexpr camp::idx_t s_shift_per_register = log_base2_t::value;
+
+  static constexpr camp::idx_t s_mask_per_register =
+      (1 << log_base2_t::value) - 1;
+
+  // Offset of last register in m_registers
+  static constexpr camp::idx_t s_final_register = s_num_partial_lanes == 0
+                                                      ? s_num_full_registers - 1
+                                                      : s_num_full_registers;
+
+  template <typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_register(IDX i) -> IDX
   {
-    public:
-      using self_type = TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>;
-      using base_type = internal::expt::TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>>;
-      using element_type = camp::decay<T>;
-      using layout_type = TensorLayout<0>;
-      using register_type = Register<T, REGISTER_POLICY>;
-
-      static constexpr camp::idx_t s_num_elem = SIZE;
-
-      using int_element_type = typename register_type::int_vector_type::element_type;
-      using int_vector_type = TensorRegister<REGISTER_POLICY, int_element_type, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>;
-
-    private:
+    return i >> IDX(s_shift_per_register);
+  }
 
-      static constexpr camp::idx_t s_register_num_elem = register_type::s_num_elem;
-
-      static constexpr camp::idx_t s_num_full_registers = s_num_elem/s_register_num_elem;
+  template <typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_lane(IDX i) -> IDX
+  {
+    return i & IDX(s_mask_per_register);
+  }
 
-      static constexpr camp::idx_t s_num_partial_lanes =  s_num_elem%s_register_num_elem;
 
-      static constexpr camp::idx_t s_num_registers =
-          (s_num_partial_lanes > 0) ?
-              s_num_full_registers + 1 :
-              s_num_full_registers;
+  using base_type::m_registers;
 
-      using log_base2_t = RAJA::LogBase2<s_register_num_elem>;
+public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr TensorRegister() {}
 
-      static constexpr camp::idx_t s_shift_per_register =
-          log_base2_t::value;
 
-      static constexpr camp::idx_t s_mask_per_register =
-          (1<<log_base2_t::value)-1;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  TensorRegister(element_type c) { this->broadcast(c); }
 
-      // Offset of last regiser in m_registers
-      static constexpr camp::idx_t s_final_register =
-          s_num_partial_lanes == 0 ?
-              s_num_full_registers-1 : s_num_full_registers;
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_register(IDX i) -> IDX {
-        return i >> IDX(s_shift_per_register);
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  TensorRegister(self_type const& c) : base_type(c) {}
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_lane(IDX i) -> IDX {
-        return i & IDX(s_mask_per_register);
-      }
+  /*
+   * Overload for:    assignment of ET to a RAJA::expt::TensorRegister
+   */
+  template <typename RHS,
+            typename std::enable_if<
+                std::is_base_of<
+                    RAJA::internal::expt::ET::TensorExpressionConcreteBase,
+                    RHS>::value,
+                bool>::type = true>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorRegister(RHS const& rhs)
+  {
+    // evaluate a single tile of the ET, storing in this
+    // RAJA::expt::TensorRegister
+    *this = rhs.eval(base_type::s_get_default_tile());
+  }
 
 
-      using base_type::m_registers;
+  template <typename... REGS>
+  explicit RAJA_HOST_DEVICE RAJA_INLINE TensorRegister(register_type reg0,
+                                                       REGS const&... regs)
+      : base_type(reg0, regs...)
+  {}
 
-    public:
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return register_type::is_root(); }
 
 
+  /*!
+   * Returns true if the underlying data is packed for a given tensor ref
+   *
+   * This is true if either:
+   *   It's column major and the rows are stride one
+   *   It's row major and the columns are stride one
+   */
+  template <camp::idx_t STRIDE_ONE_DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr bool is_ref_packed()
+  {
+    return STRIDE_ONE_DIM == 0;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegister(){}
 
+  /*!
+   * Gets the maximum size of this vector along the specified dimension
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  static constexpr camp::idx_t s_dim_elem(camp::idx_t dim)
+  {
+    return dim == 0 ? s_num_elem : 0;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(element_type c)
-      {
-        this->broadcast(c);
-      }
 
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator=(element_type value)
+  {
+    this->broadcast(value);
+    return *this;
+  }
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(self_type const &c) :
-        base_type(c)
-      {
-      }
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator=(self_type const& c) { return this->copy(c); }
 
-      /*
-       * Overload for:    assignment of ET to a RAJA::expt::TensorRegister
-       */
-      template<typename RHS,
-        typename std::enable_if<std::is_base_of<RAJA::internal::expt::ET::TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(RHS const &rhs)
-      {
-        // evaluate a single tile of the ET, storing in this RAJA::expt::TensorRegister
-        *this = rhs.eval(base_type::s_get_default_tile());
-      }
+  /*!
+   * Provide left vector-matrix multiply for operator* between
+   * this vector and a matrix
+   */
+  template <typename T2, typename L, typename RP>
+  self_type operator*(SquareMatrixRegister<T2, L, RP> const& y) const
+  {
+    return y.left_vector_multiply(*this);
+  }
 
 
-      template<typename ... REGS>
-      explicit
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(register_type reg0, REGS const &... regs) :
-        base_type(reg0, regs...)
-      {
-      }
+  template <typename REF_TYPE>
+  struct RefBridge;
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return register_type::is_root();
-      }
 
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& load_ref(REF_TYPE const& ref)
+  {
+    RefBridge<REF_TYPE>::load_ref(*this, ref);
+    return *this;
+  }
 
-      /*!
-       * Returns true if the underlying data packed for a given tensor ref
-       *
-       * This is true if either:
-       *   It's column major and the rows are stride one
-       *   It's row major and the columns are stride one
-       */
-      template<camp::idx_t STRIDE_ONE_DIM>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_ref_packed() {
-        return STRIDE_ONE_DIM == 0;
-      }
+  template <typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const& store_ref(REF_TYPE& ref) const
+  {
+    RefBridge<REF_TYPE>::store_ref(*this, ref);
+    return *this;
+  }
+
+
+  template <typename POINTER_TYPE,
+            typename INDEX_TYPE,
+            RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+            camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<
+      RAJA::internal::expt::
+          TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>>
+  {
 
+    using RefType = RAJA::internal::expt::
+        TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>;
 
-      /*!
-       * Gets the maximum size of matrix along specified dimension
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr camp::idx_t s_dim_elem(camp::idx_t dim){
-        return dim == 0 ? s_num_elem : 0;
-      }
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_HOST_DEVICE
+    RAJA_INLINE
+    static void load_ref(self_type& self, RefType const& ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
 
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0)
       {
-        this->broadcast(value);
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        return this->copy(c);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed++;
+#endif
+          self.load_packed(ptr);
+        }
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed_n++;
+#endif
+          self.load_packed_n(ptr, ref.m_tile.m_size[0]);
+        }
       }
-
-      /*!
-       * Provide left vector-matrix multiply for operator* between
-       * this vector and a matrix
-       */
-      template<typename T2, typename L, typename RP>
-      self_type
-      operator*(SquareMatrixRegister<T2, L, RP> const &y) const
+      // strided data
+      else
       {
-        return y.left_vector_multiply(*this);
-      }
-
-
-      template<typename REF_TYPE>
-      struct RefBridge;
-
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type& load_ref (REF_TYPE const &ref){
-          RefBridge<REF_TYPE>::load_ref(*this,ref);
-          return *this;
-      }
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_ref (REF_TYPE &ref) const {
-          RefBridge<REF_TYPE>::store_ref(*this,ref);
-          return *this;
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided++;
+#endif
+          self.load_strided(ptr, ref.m_stride[0]);
+        }
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided_n++;
+#endif
+          self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
+        }
       }
+    }
 
 
-      
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>>
-      {
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_HOST_DEVICE
+    RAJA_INLINE
+    static void store_ref(self_type const& self, RefType& ref)
+    {
 
-          using RefType = RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void load_ref (self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed ++;
-              #endif
-                self.load_packed(ptr);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed_n ++;
-              #endif
-                self.load_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided ++;
-              #endif
-                self.load_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided_n ++;
-              #endif
-                self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-
-
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed ++;
-    #endif
-                self.store_packed(ptr);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed_n ++;
-    #endif
-                self.store_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided ++;
-    #endif
-                self.store_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided_n ++;
-    #endif
-                self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-           
-
-      };
-
-
-
-
-
-      
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, INDEX_TYPE STRIDE_VALUE, INDEX_TYPE BEGIN_VALUE, INDEX_TYPE SIZE_VALUE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::StaticTensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, camp::int_seq<INDEX_TYPE,STRIDE_VALUE>, camp::int_seq<INDEX_TYPE,BEGIN_VALUE>, camp::int_seq<INDEX_TYPE,SIZE_VALUE>, STRIDE_ONE_DIM>> 
-      {
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
 
-          using RefType = RAJA::internal::expt::StaticTensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, camp::int_seq<INDEX_TYPE,STRIDE_VALUE>, camp::int_seq<INDEX_TYPE,BEGIN_VALUE>, camp::int_seq<INDEX_TYPE,SIZE_VALUE>, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by StaticTensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void load_ref (self_type &self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed ++;
-              #endif
-                self.load_packed(ptr);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed_n ++;
-              #endif
-                self.load_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided ++;
-              #endif
-                self.load_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided_n ++;
-              #endif
-                self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-
-
-
-          /*!
-           * @brief Performs load specified by StaticTensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed ++;
-    #endif
-                self.store_packed(ptr);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed_n ++;
-    #endif
-                self.store_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided ++;
-    #endif
-                self.store_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided_n ++;
-    #endif
-                self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-           
-
-      };
-     
-
-
-
-      /*!
-       * Loads a dense full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr)
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0)
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].load_packed(ptr+reg*s_register_num_elem);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed++;
+#endif
+          self.store_packed(ptr);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_packed_n(ptr+s_final_register*s_register_num_elem, s_num_partial_lanes);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed_n++;
+#endif
+          self.store_packed_n(ptr, ref.m_tile.m_size[0]);
         }
-        return *this;
       }
-
-      /*!
-       * Loads a strided full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, int stride)
+      // strided data
+      else
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].load_strided(ptr+reg*s_register_num_elem*stride, stride);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided++;
+#endif
+          self.store_strided(ptr, ref.m_stride[0]);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_strided_n(ptr+s_final_register*s_register_num_elem*stride, stride, s_num_partial_lanes);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided_n++;
+#endif
+          self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
         }
-        return *this;
       }
+    }
+  };
 
-      /*!
-       * Loads a dense partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, int N)
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].load_packed(ptr+reg*s_register_num_elem);
-          }
-          else{
-            m_registers[reg].load_packed_n(ptr+reg*s_register_num_elem,
-                                           N-reg*s_register_num_elem);
-
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_packed_n(
-              ptr+s_final_register*s_register_num_elem,
-              N-s_final_register*s_register_num_elem);
-        }
-        return *this;
-      }
+  template <typename POINTER_TYPE,
+            typename INDEX_TYPE,
+            RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+            INDEX_TYPE STRIDE_VALUE,
+            INDEX_TYPE BEGIN_VALUE,
+            INDEX_TYPE SIZE_VALUE,
+            camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<RAJA::internal::expt::StaticTensorRef<
+      POINTER_TYPE,
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      camp::int_seq<INDEX_TYPE, STRIDE_VALUE>,
+      camp::int_seq<INDEX_TYPE, BEGIN_VALUE>,
+      camp::int_seq<INDEX_TYPE, SIZE_VALUE>,
+      STRIDE_ONE_DIM>>
+  {
 
-      /*!
-       * Loads a strided partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr,
-          int stride, int N)
+    using RefType = RAJA::internal::expt::StaticTensorRef<
+        POINTER_TYPE,
+        INDEX_TYPE,
+        TENSOR_SIZE,
+        camp::int_seq<INDEX_TYPE, STRIDE_VALUE>,
+        camp::int_seq<INDEX_TYPE, BEGIN_VALUE>,
+        camp::int_seq<INDEX_TYPE, SIZE_VALUE>,
+        STRIDE_ONE_DIM>;
+
+    /*!
+     * @brief Performs load specified by StaticTensorRef object.
+     */
+    RAJA_HOST_DEVICE
+    RAJA_INLINE
+    static void load_ref(self_type& self, RefType const& ref)
+    {
+
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
+
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0)
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].load_strided(ptr+reg*s_register_num_elem*stride, stride);
-          }
-          else{
-            m_registers[reg].load_strided_n(ptr+reg*s_register_num_elem*stride,
-                                            stride,
-                                            N-reg*s_register_num_elem);
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
-
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed++;
+#endif
+          self.load_packed(ptr);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_strided_n(
-              ptr+s_final_register*s_register_num_elem*stride,
-              stride,
-              N-s_final_register*s_register_num_elem);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed_n++;
+#endif
+          self.load_packed_n(ptr, ref.m_tile.m_size[0]);
         }
-        return *this;
       }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].gather(ptr, offsets.vec(reg));
+      // strided data
+      else
+      {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided++;
+#endif
+          self.load_strided(ptr, ref.m_stride[0]);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register), s_num_partial_lanes);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided_n++;
+#endif
+          self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
         }
-        return *this;
       }
+    }
 
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].gather(ptr, offsets.vec(reg));
-          }
-          else{
-            m_registers[reg].gather_n(ptr, offsets.vec(reg), N-reg*s_register_num_elem);
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].gather_n(
-              ptr,
-              offsets.vec(s_final_register),
-              N-s_final_register*s_register_num_elem);
-        }
-        return *this;
-      }
+    /*!
+     * @brief Performs store specified by StaticTensorRef object.
+     */
+    RAJA_HOST_DEVICE
+    RAJA_INLINE
+    static void store_ref(self_type const& self, RefType& ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
 
-      /*!
-       * Loads a dense full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0)
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].store_packed(ptr+reg*s_register_num_elem);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed++;
+#endif
+          self.store_packed(ptr);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_packed_n(ptr+s_final_register*s_register_num_elem, s_num_partial_lanes);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed_n++;
+#endif
+          self.store_packed_n(ptr, ref.m_tile.m_size[0]);
         }
-        return *this;
       }
-
-      /*!
-       * Loads a strided full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, int stride) const
+      // strided data
+      else
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].store_strided(ptr+reg*s_register_num_elem*stride, stride);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided++;
+#endif
+          self.store_strided(ptr, ref.m_stride[0]);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_strided_n(ptr+s_final_register*s_register_num_elem*stride, stride, s_num_partial_lanes);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided_n++;
+#endif
+          self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
         }
-        return *this;
       }
+    }
+  };
+
+
+  /*!
+   * Loads a dense full vector from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      m_registers[reg].load_packed(ptr + reg * s_register_num_elem);
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].load_packed_n(
+          ptr + s_final_register * s_register_num_elem, s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * Loads a dense partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, int N) const
+  /*!
+   * Loads a strided full vector from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, int stride)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      m_registers[reg].load_strided(ptr + reg * s_register_num_elem * stride,
+                                    stride);
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].load_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride, stride,
+          s_num_partial_lanes);
+    }
+    return *this;
+  }
+
+  /*!
+   * Loads a dense partial vector from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, int N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {
+        m_registers[reg].load_packed(ptr + reg * s_register_num_elem);
+      }
+      else
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].store_packed(ptr+reg*s_register_num_elem);
-          }
-          else{
-            m_registers[reg].store_packed_n(ptr+reg*s_register_num_elem,
-                                           N-reg*s_register_num_elem);
-            return *this;
-          }
+        m_registers[reg].load_packed_n(ptr + reg * s_register_num_elem,
+                                       N - reg * s_register_num_elem);
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_packed_n(
-              ptr+s_final_register*s_register_num_elem,
-              N-s_final_register*s_register_num_elem);
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
+        {
+          m_registers[r].broadcast(0);
         }
         return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].load_packed_n(
+          ptr + s_final_register * s_register_num_elem,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * Loads a strided partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type  *ptr,
-          int stride, int N) const
+  /*!
+   * Loads a strided partial vector from memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr, int stride, int N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].store_strided(ptr+reg*s_register_num_elem*stride, stride);
-          }
-          else{
-            m_registers[reg].store_strided_n(ptr+reg*s_register_num_elem*stride,
-                                            stride,
-                                            N-reg*s_register_num_elem);
-            return *this;
-          }
-
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_strided_n(
-              ptr+s_final_register*s_register_num_elem*stride,
-              stride,
-              N-s_final_register*s_register_num_elem);
+        m_registers[reg].load_strided(ptr + reg * s_register_num_elem * stride,
+                                      stride);
+      }
+      else
+      {
+        m_registers[reg].load_strided_n(ptr +
+                                            reg * s_register_num_elem * stride,
+                                        stride, N - reg * s_register_num_elem);
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
+        {
+          m_registers[r].broadcast(0);
         }
         return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].load_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride, stride,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
 
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      m_registers[reg].gather(ptr, offsets.vec(reg));
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register),
+                                             s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, int_vector_type const &offsets) const {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].scatter(ptr, offsets.vec(reg));
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].scatter_n(ptr, offsets.vec(s_final_register), s_num_partial_lanes);
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  gather_n(element_type const* ptr, int_vector_type offsets, camp::idx_t N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {
+        m_registers[reg].gather(ptr, offsets.vec(reg));
+      }
+      else
+      {
+        m_registers[reg].gather_n(ptr, offsets.vec(reg),
+                                  N - reg * s_register_num_elem);
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
+        {
+          m_registers[r].broadcast(0);
         }
         return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register),
+                                             N - s_final_register *
+                                                     s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, int_vector_type const &offsets, camp::idx_t N) const {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].scatter(ptr, offsets.vec(reg));
-          }
-          else{
-            m_registers[reg].scatter_n(ptr, offsets.vec(reg), N-reg*s_register_num_elem);
-
-            return *this;
-          }
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].scatter_n(
-              ptr,
-              offsets.vec(s_final_register),
-              N-s_num_full_registers*s_register_num_elem);
-        }
-        return *this;
-      }
+  /*!
+   * Stores a dense full vector to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      m_registers[reg].store_packed(ptr + reg * s_register_num_elem);
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].store_packed_n(
+          ptr + s_final_register * s_register_num_elem, s_num_partial_lanes);
+    }
+    return *this;
+  }
 
+  /*!
+   * Stores a strided full vector to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, int stride) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      m_registers[reg].store_strided(ptr + reg * s_register_num_elem * stride,
+                                     stride);
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].store_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride, stride,
+          s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &den) const {
-        self_type result;
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          result.vec(reg) = m_registers[reg].divide(den.vec(reg));
-        }
-        if(s_num_partial_lanes){
-          result.vec(s_final_register) = m_registers[s_final_register].divide_n(den.vec(s_final_register), s_num_partial_lanes);
-        }
-        return result;
+  /*!
+   * Stores a dense partial vector to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, int N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {
+        m_registers[reg].store_packed(ptr + reg * s_register_num_elem);
       }
-
-      /*!
-       * @brief Divide n elements of this vector by another vector
-       * @param x Vector to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t n) const {
-        self_type q(*this);
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(this->get(i) / b.get(i), i);
-        }
-        return q;
+      else
+      {
+        m_registers[reg].store_packed_n(ptr + reg * s_register_num_elem,
+                                        N - reg * s_register_num_elem);
+        return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].store_packed_n(
+          ptr + s_final_register * s_register_num_elem,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Divide n elements of this vector by a scalar
-       * @param x Scalar to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(element_type const &b, camp::idx_t n) const {
-        self_type q(*this);
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(this->get(i) / b, i);
-        }
-        return q;
+  /*!
+   * Stores a strided partial vector to memory
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr, int stride, int N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {
+        m_registers[reg].store_strided(ptr + reg * s_register_num_elem * stride,
+                                       stride);
       }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min() const
+      else
       {
-        // special case where there's just one parital register
-        if(s_num_full_registers == 0){
-          return m_registers[0].min_n(s_num_partial_lanes);
-        }
-
-        element_type result = m_registers[0].min();
-        for(camp::idx_t i = 1;i < s_num_full_registers;++ i){
-          result = RAJA::min<element_type>(result, m_registers[i].min());
-        }
-        if(s_num_partial_lanes){
-          result = RAJA::min<element_type>(result, m_registers[s_final_register].min_n(s_num_partial_lanes));
-        }
-        return result;
+        m_registers[reg].store_strided_n(ptr +
+                                             reg * s_register_num_elem * stride,
+                                         stride, N - reg * s_register_num_elem);
+        return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].store_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride, stride,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Returns the smallest element over the first N lanes
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min_n(int N) const
-      {
-        // special case where there's just one parital register
-        if(N < s_register_num_elem){
-          return m_registers[0].min_n(N);
-        }
 
-        element_type result = m_registers[0].min();
-        for(camp::idx_t reg = 1;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            result = RAJA::min<element_type>(result, m_registers[reg].min());
-          }
-          else{
-            return RAJA::min<element_type>(result, m_registers[reg].min_n(N-reg*s_register_num_elem));
-          }
-        }
-        if(N-s_num_full_registers*s_register_num_elem > 0){
-          result = RAJA::min<element_type>(result, m_registers[s_final_register].min_n(N-s_final_register*s_register_num_elem));
-        }
-        return result;
-      }
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& scatter(element_type* ptr,
+                           int_vector_type const& offsets) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      m_registers[reg].scatter(ptr, offsets.vec(reg));
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].scatter_n(
+          ptr, offsets.vec(s_final_register), s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max() const
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& scatter_n(element_type* ptr,
+                             int_vector_type const& offsets,
+                             camp::idx_t N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
       {
-        // special case where there's just one parital register
-        if(s_num_full_registers == 0){
-          return m_registers[0].max_n(s_num_partial_lanes);
-        }
-
-        element_type result = m_registers[0].max();
-        for(camp::idx_t i = 1;i < s_num_full_registers;++ i){
-          result = RAJA::max<element_type>(result, m_registers[i].max());
-        }
-        if(s_num_partial_lanes){
-          result = RAJA::max<element_type>(result, m_registers[s_final_register].max_n(s_num_partial_lanes));
-        }
-        return result;
+        m_registers[reg].scatter(ptr, offsets.vec(reg));
       }
-
-      /*!
-       * @brief Returns the largest element over the first N lanes
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max_n(int N) const
+      else
       {
-        // special case where there's just one parital register
-        if(N < s_register_num_elem){
-          return m_registers[0].max_n(N);
-        }
+        m_registers[reg].scatter_n(ptr, offsets.vec(reg),
+                                   N - reg * s_register_num_elem);
 
-        element_type result = m_registers[0].max();
-        for(camp::idx_t reg = 1;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            result = RAJA::max<element_type>(result, m_registers[reg].max());
-          }
-          else{
-            return RAJA::max<element_type>(result, m_registers[reg].max_n(N-reg*s_register_num_elem));
-          }
-        }
-        if(N-s_num_full_registers*s_register_num_elem > 0){
-          result = RAJA::max<element_type>(result, m_registers[s_final_register].max_n(N-s_final_register*s_register_num_elem));
-        }
-        return result;
+        return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].scatter_n(
+          ptr, offsets.vec(s_final_register),
+          N - s_num_full_registers * s_register_num_elem);
+    }
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& den) const
+  {
+    self_type result;
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      result.vec(reg) = m_registers[reg].divide(den.vec(reg));
+    }
+    if (s_num_partial_lanes)
+    {
+      result.vec(s_final_register) = m_registers[s_final_register].divide_n(
+          den.vec(s_final_register), s_num_partial_lanes);
+    }
+    return result;
+  }
 
-      /*!
-       * @brief Returns the sum of all elements
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type sum() const
-      {
-        // first do a vector sum of all registers
-        register_type s = m_registers[0];
-        for(camp::idx_t i = 1;i < s_num_registers;++ i){
-          s += m_registers[i];
-        }
-        // then a horizontal sum of result
-        return s.sum();
-      }
+  /*!
+   * @brief Divide n elements of this vector by another vector
+   * @param b Vector to divide by
+   * @param n Number of elements to divide
+   * @return Copy of *this with the first n elements divided by those of b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t n) const
+  {
+    self_type q(*this);
+    for (camp::idx_t i = 0; i < n; ++i)
+    {
+      q.set(this->get(i) / b.get(i), i);
+    }
+    return q;
+  }
 
+  /*!
+   * @brief Divide n elements of this vector by a scalar
+   * @param b Scalar to divide by
+   * @param n Number of elements to divide
+   * @return Copy of *this with the first n elements divided by b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(element_type const& b, camp::idx_t n) const
+  {
+    self_type q(*this);
+    for (camp::idx_t i = 0; i < n; ++i)
+    {
+      q.set(this->get(i) / b, i);
+    }
+    return q;
+  }
 
-      /*!
-       * @brief The * operator of two vectors is a element-wise multiply
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator*(self_type const &x) const {
-        return this->multiply(x);
-      }
 
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the vector
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type min() const
+  {
+    // special case where there's just one partial register
+    if (s_num_full_registers == 0)
+    {
+      return m_registers[0].min_n(s_num_partial_lanes);
+    }
+
+    element_type result = m_registers[0].min();
+    for (camp::idx_t i = 1; i < s_num_full_registers; ++i)
+    {
+      result = RAJA::min<element_type>(result, m_registers[i].min());
+    }
+    if (s_num_partial_lanes)
+    {
+      result = RAJA::min<element_type>(
+          result, m_registers[s_final_register].min_n(s_num_partial_lanes));
+    }
+    return result;
+  }
 
-      /*!
-       * @brief The dot product of two vectors
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type dot(self_type const &x) const {
-        element_type dp(0);
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          dp += m_registers[i].dot(x.vec(i));
-        }
-        return dp;
+  /*!
+   * @brief Returns the smallest element over the first N lanes
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type min_n(int N) const
+  {
+    // special case where there's just one partial register
+    if (N < s_register_num_elem)
+    {
+      return m_registers[0].min_n(N);
+    }
+
+    element_type result = m_registers[0].min();
+    for (camp::idx_t reg = 1; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {
+        result = RAJA::min<element_type>(result, m_registers[reg].min());
+      }
+      else
+      {
+        return RAJA::min<element_type>(
+            result, m_registers[reg].min_n(N - reg * s_register_num_elem));
       }
+    }
+    if (N - s_num_full_registers * s_register_num_elem > 0)
+    {
+      result = RAJA::min<element_type>(
+          result, m_registers[s_final_register].min_n(
+                      N - s_final_register * s_register_num_elem));
+    }
+    return result;
+  }
 
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type max() const
+  {
+    // special case where there's just one partial register
+    if (s_num_full_registers == 0)
+    {
+      return m_registers[0].max_n(s_num_partial_lanes);
+    }
+
+    element_type result = m_registers[0].max();
+    for (camp::idx_t i = 1; i < s_num_full_registers; ++i)
+    {
+      result = RAJA::max<element_type>(result, m_registers[i].max());
+    }
+    if (s_num_partial_lanes)
+    {
+      result = RAJA::max<element_type>(
+          result, m_registers[s_final_register].max_n(s_num_partial_lanes));
+    }
+    return result;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &set(element_type val, int idx){
-        m_registers[to_register(idx)].set(val, to_lane(idx));
-        return *this;
+  /*!
+   * @brief Returns the largest element over the first N lanes
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type max_n(int N) const
+  {
+    // special case where there's just one partial register
+    if (N < s_register_num_elem)
+    {
+      return m_registers[0].max_n(N);
+    }
+
+    element_type result = m_registers[0].max();
+    for (camp::idx_t reg = 1; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {
+        result = RAJA::max<element_type>(result, m_registers[reg].max());
       }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type get(int idx) const {
-        return m_registers[to_register(idx)].get(to_lane(idx));
+      else
+      {
+        return RAJA::max<element_type>(
+            result, m_registers[reg].max_n(N - reg * s_register_num_elem));
       }
+    }
+    if (N - s_num_full_registers * s_register_num_elem > 0)
+    {
+      result = RAJA::max<element_type>(
+          result, m_registers[s_final_register].max_n(
+                      N - s_final_register * s_register_num_elem));
+    }
+    return result;
+  }
 
+  /*!
+   * @brief Returns the sum of all elements
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  element_type sum() const
+  {
+    // first do a vector sum of all registers
+    register_type s = m_registers[0];
+    for (camp::idx_t i = 1; i < s_num_registers; ++i)
+    {
+      s += m_registers[i];
+    }
+    // then a horizontal sum of result
+    return s.sum();
+  }
 
 
-      /*!
-       * @brief Converts to vector to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string() const {
-        std::string s = "Vector(" + std::to_string(s_num_elem) + ")[ ";
+  /*!
+   * @brief The * operator of two vectors is an element-wise multiply
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type operator*(self_type const& x) const { return this->multiply(x); }
 
-        //
-        for(camp::idx_t i = 0;i < s_num_elem; ++ i){
-          s += std::to_string(this->get(i)) + " ";
-        }
 
-        camp::idx_t physical_size = s_num_registers * s_register_num_elem;
-        if(s_num_elem < physical_size){
-          s += "{";
-          for(camp::idx_t i = s_num_elem;i < physical_size; ++ i){
-            s += std::to_string(this->get(i)) + " ";
-          }
-          s += "}";
-        }
+  /*!
+   * @brief The dot product of two vectors
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type dot(self_type const& x) const
+  {
+    element_type dp(0);
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      dp += m_registers[i].dot(x.vec(i));
+    }
+    return dp;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& set(element_type val, int idx)
+  {
+    m_registers[to_register(idx)].set(val, to_lane(idx));
+    return *this;
+  }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type get(int idx) const
+  {
+    return m_registers[to_register(idx)].get(to_lane(idx));
+  }
 
-        s += " ]\n";
 
-        return s;
+  /*!
+   * @brief Converts the vector to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string() const
+  {
+    std::string s = "Vector(" + std::to_string(s_num_elem) + ")[ ";
+
+    //
+    for (camp::idx_t i = 0; i < s_num_elem; ++i)
+    {
+      s += std::to_string(this->get(i)) + " ";
+    }
+
+    camp::idx_t physical_size = s_num_registers * s_register_num_elem;
+    if (s_num_elem < physical_size)
+    {
+      s += "{";
+      for (camp::idx_t i = s_num_elem; i < physical_size; ++i)
+      {
+        s += std::to_string(this->get(i)) + " ";
       }
+      s += "}";
+    }
 
 
-  };
+    s += " ]\n";
+
+    return s;
+  }
+};
 
 
-} // namespace expt
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/stats.hpp b/include/RAJA/pattern/tensor/stats.hpp
index 77b70faf00..643cd3ca22 100644
--- a/include/RAJA/pattern/tensor/stats.hpp
+++ b/include/RAJA/pattern/tensor/stats.hpp
@@ -33,7 +33,7 @@ namespace expt
 {
 struct tensor_stats
 {
-    static int indent;
+  static int indent;
 
   static camp::idx_t num_vector_copy;
   static camp::idx_t num_vector_copy_ctor;
@@ -77,10 +77,9 @@ struct tensor_stats
 
   static void resetVectorStats();
   static void printVectorStats();
-
 };
 
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/policy/MultiPolicy.hpp b/include/RAJA/policy/MultiPolicy.hpp
index defa08585a..26f06798cc 100644
--- a/include/RAJA/policy/MultiPolicy.hpp
+++ b/include/RAJA/policy/MultiPolicy.hpp
@@ -57,15 +57,14 @@ class MultiPolicy
 
 public:
   MultiPolicy() = delete;  // No default construction
-  MultiPolicy(Selector s) : s(s), _policies({Policies{}...}) {}
+  MultiPolicy(Selector s) : s(s), _policies({Policies {}...}) {}
   MultiPolicy(Selector s, Policies... policies) : s(s), _policies({policies...})
-  {
-  }
+  {}
 
-  MultiPolicy(const MultiPolicy &p) : s(p.s), _policies(p._policies) {}
+  MultiPolicy(const MultiPolicy& p) : s(p.s), _policies(p._policies) {}
 
   template <typename Iterable, typename Body>
-  int invoke(Iterable &&i, Body &&b)
+  int invoke(Iterable&& i, Body&& b)
   {
     size_t index = s(i);
     _policies.invoke(index, i, b);
@@ -86,9 +85,8 @@ template <typename Iterable,
           typename Body,
           typename Selector,
           typename... Policies>
-RAJA_INLINE void forall_impl(MultiPolicy<Selector, Policies...> p,
-                             Iterable &&iter,
-                             Body &&body)
+RAJA_INLINE void
+forall_impl(MultiPolicy<Selector, Policies...> p, Iterable&& iter, Body&& body)
 {
   p.invoke(iter, body);
 }
@@ -97,10 +95,11 @@ template <typename Res,
           typename Body,
           typename Selector,
           typename... Policies>
-RAJA_INLINE resources::EventProxy<Res> forall_impl(Res r,
-                                  MultiPolicy<Selector, Policies...> p,
-                                  Iterable &&iter,
-                                  Body &&body)
+RAJA_INLINE resources::EventProxy<Res>
+forall_impl(Res r,
+            MultiPolicy<Selector, Policies...> p,
+            Iterable&& iter,
+            Body&& body)
 {
   p.invoke(iter, body);
   return resources::EventProxy<Res>(r);
@@ -136,7 +135,7 @@ template <typename... Policies, typename Selector>
 RAJA_DEPRECATE("In the next RAJA Release, MultiPolicy will be deprecated.")
 auto make_multi_policy(Selector s) -> MultiPolicy<Selector, Policies...>
 {
-  return MultiPolicy<Selector, Policies...>(s, Policies{}...);
+  return MultiPolicy<Selector, Policies...>(s, Policies {}...);
 }
 
 /// make_multi_policy - Construct a MultiPolicy from the given selector and
@@ -153,15 +152,16 @@ RAJA_DEPRECATE("In the next RAJA Release, MultiPolicy will be deprecated.")
 auto make_multi_policy(std::tuple<Policies...> policies, Selector s)
     -> MultiPolicy<Selector, Policies...>
 {
-  return detail::make_multi_policy(
-      camp::make_idx_seq_t<sizeof...(Policies)>{}, s, policies);
+  return detail::make_multi_policy(camp::make_idx_seq_t<sizeof...(Policies)> {},
+                                   s, policies);
 }
 
 namespace detail
 {
 
 template <size_t index, size_t size, typename Policy, typename... rest>
-struct policy_invoker : public policy_invoker<index - 1, size, rest...> {
+struct policy_invoker : public policy_invoker<index - 1, size, rest...>
+{
   static_assert(index < size, "index must be in the range of possibilities");
   Policy _p;
   using NextInvoker = policy_invoker<index - 1, size, rest...>;
@@ -169,11 +169,12 @@ struct policy_invoker : public policy_invoker<index - 1, size, rest...> {
   policy_invoker(Policy p, rest... args) : NextInvoker(args...), _p(p) {}
 
   template <typename Iterable, typename LoopBody>
-  void invoke(int offset, Iterable &&iter, LoopBody &&loop_body)
+  void invoke(int offset, Iterable&& iter, LoopBody&& loop_body)
   {
-    if (offset == size - index - 1) {
+    if (offset == size - index - 1)
+    {
 
-      util::PluginContext context{util::make_context<Policy>()};
+      util::PluginContext context {util::make_context<Policy>()};
       util::callPreCapturePlugins(context);
 
       using RAJA::util::trigger_updates_before;
@@ -189,22 +190,27 @@ struct policy_invoker : public policy_invoker<index - 1, size, rest...> {
       forall_impl(r, _p, std::forward<Iterable>(iter), body);
 
       util::callPostLaunchPlugins(context);
-    } else {
-      NextInvoker::invoke(offset, std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
+    }
+    else
+    {
+      NextInvoker::invoke(offset, std::forward<Iterable>(iter),
+                          std::forward<LoopBody>(loop_body));
     }
   }
 };
 
 template <size_t size, typename Policy, typename... rest>
-struct policy_invoker<0, size, Policy, rest...> {
+struct policy_invoker<0, size, Policy, rest...>
+{
   Policy _p;
   policy_invoker(Policy p, rest...) : _p(p) {}
   template <typename Iterable, typename LoopBody>
-  void invoke(int offset, Iterable &&iter, LoopBody &&loop_body)
+  void invoke(int offset, Iterable&& iter, LoopBody&& loop_body)
   {
-    if (offset == size - 1) {
+    if (offset == size - 1)
+    {
 
-      util::PluginContext context{util::make_context<Policy>()};
+      util::PluginContext context {util::make_context<Policy>()};
       util::callPreCapturePlugins(context);
 
       using RAJA::util::trigger_updates_before;
@@ -214,14 +220,16 @@ struct policy_invoker<0, size, Policy, rest...> {
 
       util::callPreLaunchPlugins(context);
 
-      //std::cout <<"policy_invoker: No index\n";
+      // std::cout <<"policy_invoker: No index\n";
       using policy::multi::forall_impl;
       RAJA_FORCEINLINE_RECURSIVE
       auto r = resources::get_resource<Policy>::type::get_default();
       forall_impl(r, _p, std::forward<Iterable>(iter), body);
 
       util::callPostLaunchPlugins(context);
-    } else {
+    }
+    else
+    {
       throw std::runtime_error("unknown offset invoked");
     }
   }
@@ -234,8 +242,9 @@ namespace type_traits
 
 template <typename T>
 struct is_multi_policy
-    : ::RAJA::type_traits::SpecializationOf<RAJA::MultiPolicy, typename std::decay<T>::type> {
-};
+    : ::RAJA::type_traits::SpecializationOf<RAJA::MultiPolicy,
+                                            typename std::decay<T>::type>
+{};
 }  // namespace type_traits
 
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/PolicyBase.hpp b/include/RAJA/policy/PolicyBase.hpp
index 898c92a621..50f9a08863 100644
--- a/include/RAJA/policy/PolicyBase.hpp
+++ b/include/RAJA/policy/PolicyBase.hpp
@@ -26,7 +26,8 @@
 namespace RAJA
 {
 
-enum class Policy {
+enum class Policy
+{
   undefined,
   sequential,
   simd,
@@ -37,7 +38,8 @@ enum class Policy {
   sycl
 };
 
-enum class Pattern {
+enum class Pattern
+{
   undefined,
   forall,
   region,
@@ -52,97 +54,109 @@ enum class Pattern {
   workgroup_dispatch
 };
 
-enum class Launch { undefined, sync, async };
-
-struct PolicyBase {
+enum class Launch
+{
+  undefined,
+  sync,
+  async
 };
 
+struct PolicyBase
+{};
+
 template <Policy Policy_,
           Pattern Pattern_,
           Launch Launch_,
           Platform Platform_,
           typename... Traits>
-struct PolicyBaseT : PolicyBase {
-  static constexpr Policy policy = Policy_;
-  static constexpr Pattern pattern = Pattern_;
-  static constexpr Launch launch = Launch_;
+struct PolicyBaseT : PolicyBase
+{
+  static constexpr Policy policy     = Policy_;
+  static constexpr Pattern pattern   = Pattern_;
+  static constexpr Launch launch     = Launch_;
   static constexpr Platform platform = Platform_;
 };
 
 template <typename PolicyType>
-struct policy_of {
+struct policy_of
+{
   static constexpr Policy value = PolicyType::policy;
 };
 
 template <typename PolicyType>
-struct pattern_of {
+struct pattern_of
+{
   static constexpr Pattern value = PolicyType::pattern;
 };
 
 template <typename PolicyType>
-struct launch_of {
+struct launch_of
+{
   static constexpr Launch value = PolicyType::launch;
 };
 
 template <typename PolicyType>
-struct platform_of {
+struct platform_of
+{
   static constexpr Platform value = PolicyType::platform;
 };
 
 template <typename PolicyType, RAJA::Policy P_>
-struct policy_is : camp::num<policy_of<camp::decay<PolicyType>>::value == P_> {
-};
+struct policy_is : camp::num<policy_of<camp::decay<PolicyType>>::value == P_>
+{};
 
-template <typename PolicyType, RAJA::Policy ... Ps_>
-struct policy_any_of : camp::num<camp::concepts::any_of<policy_is<PolicyType, Ps_>...>::value> {
-};
+template <typename PolicyType, RAJA::Policy... Ps_>
+struct policy_any_of
+    : camp::num<camp::concepts::any_of<policy_is<PolicyType, Ps_>...>::value>
+{};
 
 template <typename PolicyType, RAJA::Pattern P_>
-struct pattern_is
-    : camp::num<pattern_of<camp::decay<PolicyType>>::value == P_> {
-};
+struct pattern_is : camp::num<pattern_of<camp::decay<PolicyType>>::value == P_>
+{};
 
 template <typename PolicyType, RAJA::Launch L_>
-struct launch_is : camp::num<launch_of<camp::decay<PolicyType>>::value == L_> {
-};
+struct launch_is : camp::num<launch_of<camp::decay<PolicyType>>::value == L_>
+{};
 
 template <typename PolicyType, RAJA::Platform P_>
 struct platform_is
-    : camp::num<platform_of<camp::decay<PolicyType>>::value == P_> {
-};
+    : camp::num<platform_of<camp::decay<PolicyType>>::value == P_>
+{};
 
 template <typename PolicyType, typename Trait>
-struct policy_has_trait_impl
-    : camp::num<false> {
-};
+struct policy_has_trait_impl : camp::num<false>
+{};
 ///
-template <typename Trait, Policy Policy_,
-                          Pattern Pattern_,
-                          Launch Launch_,
-                          Platform Platform_,
-                          typename... Traits>
+template <typename Trait,
+          Policy Policy_,
+          Pattern Pattern_,
+          Launch Launch_,
+          Platform Platform_,
+          typename... Traits>
 struct policy_has_trait_impl<
-      PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Traits...>, Trait>
-    : camp::num<camp::concepts::any_of<std::is_same<Trait, Traits>...>::value> {
-};
+    PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Traits...>,
+    Trait>
+    : camp::num<camp::concepts::any_of<std::is_same<Trait, Traits>...>::value>
+{};
 ///
 template <typename PolicyType, typename Trait>
 using policy_has_trait = policy_has_trait_impl<camp::decay<PolicyType>, Trait>;
 
 
 template <typename Inner>
-struct wrapper {
+struct wrapper
+{
   using inner = Inner;
 };
 
 namespace reduce
 {
 
-struct ordered {
-};
+struct ordered
+{};
 
-struct unordered {
-};
+struct unordered
+{};
 
 }  // namespace reduce
 
@@ -159,10 +173,7 @@ template <Policy Policy_,
 using make_policy_pattern_launch_platform_t =
     PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Args...>;
 
-template <Policy Policy_,
-          Pattern Pattern_,
-          Launch Launch_,
-          typename... Args>
+template <Policy Policy_, Pattern Pattern_, Launch Launch_, typename... Args>
 using make_policy_pattern_launch_t =
     PolicyBaseT<Policy_, Pattern_, Launch_, Platform::undefined, Args...>;
 
@@ -185,8 +196,8 @@ struct ExecutionPolicy
                     ::RAJA::concepts::has_type<::RAJA::Launch>(
                         camp::decay<decltype(Pol::launch)>()),
                     ::RAJA::concepts::has_type<::RAJA::Platform>(
-                        camp::decay<decltype(Pol::platform)>())) {
-};
+                        camp::decay<decltype(Pol::platform)>()))
+{};
 
 }  // end namespace concepts
 
@@ -194,44 +205,45 @@ namespace type_traits
 {
 
 template <typename Pol>
-struct is_sequential_policy : RAJA::policy_is<Pol, RAJA::Policy::sequential> {
-};
+struct is_sequential_policy : RAJA::policy_is<Pol, RAJA::Policy::sequential>
+{};
 template <typename Pol>
-struct is_simd_policy : RAJA::policy_is<Pol, RAJA::Policy::simd> {
-};
+struct is_simd_policy : RAJA::policy_is<Pol, RAJA::Policy::simd>
+{};
 template <typename Pol>
-struct is_openmp_policy : RAJA::policy_is<Pol, RAJA::Policy::openmp> {
-};
+struct is_openmp_policy : RAJA::policy_is<Pol, RAJA::Policy::openmp>
+{};
 template <typename Pol>
 struct is_target_openmp_policy
-    : RAJA::policy_is<Pol, RAJA::Policy::target_openmp> {
-};
+    : RAJA::policy_is<Pol, RAJA::Policy::target_openmp>
+{};
 template <typename Pol>
-struct is_cuda_policy : RAJA::policy_is<Pol, RAJA::Policy::cuda> {
-};
+struct is_cuda_policy : RAJA::policy_is<Pol, RAJA::Policy::cuda>
+{};
 template <typename Pol>
-struct is_hip_policy : RAJA::policy_is<Pol, RAJA::Policy::hip> {
-};
+struct is_hip_policy : RAJA::policy_is<Pol, RAJA::Policy::hip>
+{};
 template <typename Pol>
-struct is_sycl_policy : RAJA::policy_is<Pol, RAJA::Policy::sycl> {
-};
+struct is_sycl_policy : RAJA::policy_is<Pol, RAJA::Policy::sycl>
+{};
 
 template <typename Pol>
 struct is_device_exec_policy
-    : RAJA::policy_any_of<Pol, RAJA::Policy::cuda, RAJA::Policy::hip> {
-};
+    : RAJA::policy_any_of<Pol, RAJA::Policy::cuda, RAJA::Policy::hip>
+{};
 
 DefineTypeTraitFromConcept(is_execution_policy,
                            RAJA::concepts::ExecutionPolicy);
 
 
 template <typename Pol>
-struct is_reduce_policy : RAJA::pattern_is<Pol, RAJA::Pattern::reduce> {
-};
+struct is_reduce_policy : RAJA::pattern_is<Pol, RAJA::Pattern::reduce>
+{};
 
 template <typename Pol>
-struct is_multi_reduce_policy : RAJA::pattern_is<Pol, RAJA::Pattern::multi_reduce> {
-};
+struct is_multi_reduce_policy
+    : RAJA::pattern_is<Pol, RAJA::Pattern::multi_reduce>
+{};
 
 }  // end namespace type_traits
 
diff --git a/include/RAJA/policy/WorkGroup.hpp b/include/RAJA/policy/WorkGroup.hpp
index cae78d2493..4e1779bb39 100644
--- a/include/RAJA/policy/WorkGroup.hpp
+++ b/include/RAJA/policy/WorkGroup.hpp
@@ -39,74 +39,75 @@ namespace workgroup
 /// Note this is intended for debugging, the WorkGroup abstraction is intended
 /// to allow running loops in an unordered fashion (loop fusion)
 struct ordered
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_order> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_order>
+{};
 /// execute the enqueued loops in the reverse order from the order that they
 /// were enqueued
 /// Note this is intended for debugging, the WorkGroup abstraction is intended
 /// to allow running loops in an unordered fashion (loop fusion)
 struct reverse_ordered
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_order> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_order>
+{};
 
 /// store an array of pointers to the enqueued objects. The enqueued objects
 /// are stored in separate allocations.
 struct array_of_pointers
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_storage> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_storage>
+{};
 /// store an array of pointers to the enqueued objects. The enqueued objects
 /// are stored in a single compact array.
 struct ragged_array_of_objects
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_storage> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_storage>
+{};
 /// store an array of the enqueued objects with padding such that the objects
 /// can be accessed using a constant stride from the beginning of the array.
 struct constant_stride_array_of_objects
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_storage> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_storage>
+{};
 
 /// Dispatch using function pointers to make indirect function calls
 struct indirect_function_call_dispatch
     : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_dispatch> {
-};
+                                  Pattern::workgroup_dispatch>
+{};
 /// Dispatch using virtual functions to make indirect function calls
 struct indirect_virtual_function_dispatch
     : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_dispatch> {
-};
+                                  Pattern::workgroup_dispatch>
+{};
 /// Dispatch using an implementation equivalent to a switch statement to select
 /// the type from RangeAndCallables and directly call the object.
 /// RangeAndCallables is a pack of types of the form camp::list<Range, Callable>
 /// where pairs of Range and Callable are the types of the range and callable
 /// objects that may be passed to WorkPool enqueue.
-template < typename ... RangeAndCallables >
+template <typename... RangeAndCallables>
 struct direct_dispatch
     : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_dispatch> {
-};
-
-template < typename EXEC_POLICY_T,
-           typename ORDER_POLICY_T,
-           typename STORAGE_POLICY_T,
-           typename DISPATCH_POLICY_T = indirect_function_call_dispatch >
-struct WorkGroupPolicy
-    : public RAJA::make_policy_pattern_platform_t<
-                       policy_of<EXEC_POLICY_T>::value,
-                       Pattern::workgroup,
-                       platform_of<EXEC_POLICY_T>::value> {
-  static_assert(RAJA::pattern_is<EXEC_POLICY_T, RAJA::Pattern::workgroup_exec>::value,
+                                  Pattern::workgroup_dispatch>
+{};
+
+template <typename EXEC_POLICY_T,
+          typename ORDER_POLICY_T,
+          typename STORAGE_POLICY_T,
+          typename DISPATCH_POLICY_T = indirect_function_call_dispatch>
+struct WorkGroupPolicy : public RAJA::make_policy_pattern_platform_t<
+                             policy_of<EXEC_POLICY_T>::value,
+                             Pattern::workgroup,
+                             platform_of<EXEC_POLICY_T>::value>
+{
+  static_assert(
+      RAJA::pattern_is<EXEC_POLICY_T, RAJA::Pattern::workgroup_exec>::value,
       "WorkGroupPolicy: EXEC_POLICY_T must be a workgroup exec policy");
-  static_assert(RAJA::pattern_is<ORDER_POLICY_T, RAJA::Pattern::workgroup_order>::value,
+  static_assert(
+      RAJA::pattern_is<ORDER_POLICY_T, RAJA::Pattern::workgroup_order>::value,
       "WorkGroupPolicy: ORDER_POLICY_T must be a workgroup order policy");
-  static_assert(RAJA::pattern_is<STORAGE_POLICY_T, RAJA::Pattern::workgroup_storage>::value,
+  static_assert(
+      RAJA::pattern_is<STORAGE_POLICY_T,
+                       RAJA::Pattern::workgroup_storage>::value,
       "WorkGroupPolicy: STORAGE_POLICY_T must be a workgroup storage policy");
-  static_assert(RAJA::pattern_is<DISPATCH_POLICY_T, RAJA::Pattern::workgroup_dispatch>::value,
+  static_assert(
+      RAJA::pattern_is<DISPATCH_POLICY_T,
+                       RAJA::Pattern::workgroup_dispatch>::value,
       "WorkGroupPolicy: DISPATCH_POLICY_T must be a workgroup dispatch policy");
 };
 
@@ -117,12 +118,12 @@ using policy::workgroup::ordered;
 using policy::workgroup::reverse_ordered;
 
 using policy::workgroup::array_of_pointers;
-using policy::workgroup::ragged_array_of_objects;
 using policy::workgroup::constant_stride_array_of_objects;
+using policy::workgroup::ragged_array_of_objects;
 
+using policy::workgroup::direct_dispatch;
 using policy::workgroup::indirect_function_call_dispatch;
 using policy::workgroup::indirect_virtual_function_dispatch;
-using policy::workgroup::direct_dispatch;
 
 using policy::workgroup::WorkGroupPolicy;
 
diff --git a/include/RAJA/policy/atomic_auto.hpp b/include/RAJA/policy/atomic_auto.hpp
index e0ca557b32..ee859b4a91 100644
--- a/include/RAJA/policy/atomic_auto.hpp
+++ b/include/RAJA/policy/atomic_auto.hpp
@@ -23,7 +23,7 @@
 #include "RAJA/util/macros.hpp"
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/sequential/atomic.hpp"
+#include "RAJA/policy/sequential/atomic.hpp"
 #endif
 
 /*!
@@ -39,19 +39,19 @@
  * because we assume there is no thread safety issues (no parallel model)
  */
 #if defined(__CUDA_ARCH__) && defined(RAJA_CUDA_ACTIVE)
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::cuda_atomic {}
 #elif defined(__HIP_DEVICE_COMPILE__) && defined(RAJA_HIP_ACTIVE)
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::hip_atomic {}
 #elif defined(__SYCL_DEVICE_ONLY__)
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::sycl_atomic {}
 #elif defined(RAJA_ENABLE_OPENMP)
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::omp_atomic {}
 #else
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::seq_atomic {}
 #endif
 
@@ -60,102 +60,96 @@ namespace RAJA
 {
 
 //! Atomic policy that automatically does "the right thing"
-struct auto_atomic {
-};
+struct auto_atomic
+{};
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(auto_atomic, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(auto_atomic, T* acc)
 {
   return atomicLoad(RAJA_AUTO_ATOMIC, acc);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(auto_atomic, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(auto_atomic, T* acc, T value)
 {
   atomicStore(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(auto_atomic, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(auto_atomic, T* acc, T value)
 {
   return atomicAdd(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(auto_atomic, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(auto_atomic, T* acc, T value)
 {
   return atomicSub(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(auto_atomic, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(auto_atomic, T* acc, T value)
 {
   return atomicMin(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(auto_atomic, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(auto_atomic, T* acc, T value)
 {
   return atomicMax(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T* acc)
 {
   return atomicInc(RAJA_AUTO_ATOMIC, acc);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic,
-                                         T *acc,
-                                         T compare)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T* acc, T compare)
 {
   return atomicInc(RAJA_AUTO_ATOMIC, acc, compare);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T* acc)
 {
   return atomicDec(RAJA_AUTO_ATOMIC, acc);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic,
-                                         T *acc,
-                                         T compare)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T* acc, T compare)
 {
   return atomicDec(RAJA_AUTO_ATOMIC, acc, compare);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(auto_atomic, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(auto_atomic, T* acc, T value)
 {
   return atomicAnd(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(auto_atomic, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(auto_atomic, T* acc, T value)
 {
   return atomicOr(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T* acc, T value)
 {
   return atomicXor(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic,
-                                              T *acc,
-                                              T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic, T* acc, T value)
 {
   return atomicExchange(RAJA_AUTO_ATOMIC, acc, value);
 }
 
 template <typename T>
 RAJA_INLINE RAJA_HOST_DEVICE T
-atomicCAS(auto_atomic, T *acc, T compare, T value)
+atomicCAS(auto_atomic, T* acc, T compare, T value)
 {
   return atomicCAS(RAJA_AUTO_ATOMIC, acc, compare, value);
 }
diff --git a/include/RAJA/policy/atomic_builtin.hpp b/include/RAJA/policy/atomic_builtin.hpp
index e43bd71386..742aaa25b8 100644
--- a/include/RAJA/policy/atomic_builtin.hpp
+++ b/include/RAJA/policy/atomic_builtin.hpp
@@ -22,7 +22,8 @@
 
 #include <cstdint>
 
-#if defined(RAJA_COMPILER_MSVC) || ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
+#if defined(RAJA_COMPILER_MSVC) ||                                             \
+    ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
 #include <intrin.h>
 #endif
 
@@ -41,14 +42,16 @@ namespace RAJA
 
 
 //! Atomic policy that uses the compilers builtin __atomic_XXX routines
-struct builtin_atomic {
-};
+struct builtin_atomic
+{};
 
 
-namespace detail {
+namespace detail
+{
 
 
-#if defined(RAJA_COMPILER_MSVC) || ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
+#if defined(RAJA_COMPILER_MSVC) ||                                             \
+    ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
 
 
 /*!
@@ -56,12 +59,11 @@ namespace detail {
  * using an intrinsic
  */
 template <typename T>
-struct builtin_useIntrinsic {
+struct builtin_useIntrinsic
+{
   static constexpr bool value =
-    std::is_same<T, char>::value ||
-    std::is_same<T, short>::value ||
-    std::is_same<T, long>::value ||
-    std::is_same<T, long long>::value;
+      std::is_same<T, char>::value || std::is_same<T, short>::value ||
+      std::is_same<T, long>::value || std::is_same<T, long long>::value;
 };
 
 
@@ -70,18 +72,18 @@ struct builtin_useIntrinsic {
  * by reinterpreting inputs to types that intrinsics support
  */
 template <typename T>
-struct builtin_useReinterpret {
+struct builtin_useReinterpret
+{
   static constexpr bool value =
-    !builtin_useIntrinsic<T>::value &&
-    (sizeof(T) == 1 ||
-     sizeof(T) == 2 ||
-     sizeof(T) == 4 ||
-     sizeof(T) == 8);
-
-  using type =
-    std::conditional_t<sizeof(T) == 1, char,
-    std::conditional_t<sizeof(T) == 2, short,
-    std::conditional_t<sizeof(T) == 4, long, long long>>>;
+      !builtin_useIntrinsic<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+
+  using type = std::conditional_t<
+      sizeof(T) == 1,
+      char,
+      std::conditional_t<sizeof(T) == 2,
+                         short,
+                         std::conditional_t<sizeof(T) == 4, long, long long>>>;
 };
 
 
@@ -90,10 +92,11 @@ struct builtin_useReinterpret {
  * using a compare and swap loop
  */
 template <typename T>
-struct builtin_useCAS {
+struct builtin_useCAS
+{
   static constexpr bool value =
-    !builtin_useIntrinsic<T>::value &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      !builtin_useIntrinsic<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
 
@@ -105,24 +108,24 @@ struct builtin_useCAS {
 /*!
  * Atomic or using intrinsics
  */
-RAJA_INLINE char builtin_atomicOr(char *acc, char value)
+RAJA_INLINE char builtin_atomicOr(char* acc, char value)
 {
   return _InterlockedOr8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicOr(short *acc, short value)
+RAJA_INLINE short builtin_atomicOr(short* acc, short value)
 {
   return _InterlockedOr16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicOr(long *acc, long value)
+RAJA_INLINE long builtin_atomicOr(long* acc, long value)
 {
   return _InterlockedOr(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicOr(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicOr(long long* acc, long long value)
 {
   return _InterlockedOr64(acc, value);
 }
@@ -134,7 +137,7 @@ RAJA_INLINE long long builtin_atomicOr(long long *acc, long long value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_INLINE T builtin_atomicLoad(T *acc)
+RAJA_INLINE T builtin_atomicLoad(T* acc)
 {
   return builtin_atomicOr(acc, static_cast<T>(0));
 }
@@ -143,24 +146,24 @@ RAJA_INLINE T builtin_atomicLoad(T *acc)
 /*!
  * Atomic exchange using intrinsics
  */
-RAJA_INLINE char builtin_atomicExchange(char *acc, char value)
+RAJA_INLINE char builtin_atomicExchange(char* acc, char value)
 {
   return _InterlockedExchange8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicExchange(short *acc, short value)
+RAJA_INLINE short builtin_atomicExchange(short* acc, short value)
 {
   return _InterlockedExchange16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicExchange(long *acc, long value)
+RAJA_INLINE long builtin_atomicExchange(long* acc, long value)
 {
   return _InterlockedExchange(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicExchange(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicExchange(long long* acc, long long value)
 {
   return _InterlockedExchange64(acc, value);
 }
@@ -173,7 +176,7 @@ RAJA_INLINE long long builtin_atomicExchange(long long *acc, long long value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_INLINE void builtin_atomicStore(T *acc, T value)
+RAJA_INLINE void builtin_atomicStore(T* acc, T value)
 {
   builtin_atomicExchange(acc, value);
 }
@@ -182,24 +185,25 @@ RAJA_INLINE void builtin_atomicStore(T *acc, T value)
 /*!
  * Atomic compare and swap using intrinsics
  */
-RAJA_INLINE char builtin_atomicCAS(char *acc, char compare, char value)
+RAJA_INLINE char builtin_atomicCAS(char* acc, char compare, char value)
 {
   return _InterlockedCompareExchange8(acc, value, compare);
 }
 
-RAJA_INLINE short builtin_atomicCAS(short *acc, short compare, short value)
+RAJA_INLINE short builtin_atomicCAS(short* acc, short compare, short value)
 {
   return _InterlockedCompareExchange16(acc, value, compare);
 }
 
-RAJA_INLINE long builtin_atomicCAS(long *acc, long compare, long value)
+RAJA_INLINE long builtin_atomicCAS(long* acc, long compare, long value)
 {
   return _InterlockedCompareExchange(acc, value, compare);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long long value)
+RAJA_INLINE long long
+builtin_atomicCAS(long long* acc, long long compare, long long value)
 {
   return _InterlockedCompareExchange64(acc, value, compare);
 }
@@ -210,24 +214,24 @@ RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long
 /*!
  * Atomic addition using intrinsics
  */
-RAJA_INLINE char builtin_atomicAdd(char *acc, char value)
+RAJA_INLINE char builtin_atomicAdd(char* acc, char value)
 {
   return _InterlockedExchangeAdd8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicAdd(short *acc, short value)
+RAJA_INLINE short builtin_atomicAdd(short* acc, short value)
 {
   return _InterlockedExchangeAdd16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicAdd(long *acc, long value)
+RAJA_INLINE long builtin_atomicAdd(long* acc, long value)
 {
   return _InterlockedExchangeAdd(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicAdd(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicAdd(long long* acc, long long value)
 {
   return _InterlockedExchangeAdd64(acc, value);
 }
@@ -238,24 +242,24 @@ RAJA_INLINE long long builtin_atomicAdd(long long *acc, long long value)
 /*!
  * Atomic subtraction using intrinsics
  */
-RAJA_INLINE char builtin_atomicSub(char *acc, char value)
+RAJA_INLINE char builtin_atomicSub(char* acc, char value)
 {
   return _InterlockedExchangeAdd8(acc, -value);
 }
 
-RAJA_INLINE short builtin_atomicSub(short *acc, short value)
+RAJA_INLINE short builtin_atomicSub(short* acc, short value)
 {
   return _InterlockedExchangeAdd16(acc, -value);
 }
 
-RAJA_INLINE long builtin_atomicSub(long *acc, long value)
+RAJA_INLINE long builtin_atomicSub(long* acc, long value)
 {
   return _InterlockedExchangeAdd(acc, -value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicSub(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicSub(long long* acc, long long value)
 {
   return _InterlockedExchangeAdd64(acc, -value);
 }
@@ -266,24 +270,24 @@ RAJA_INLINE long long builtin_atomicSub(long long *acc, long long value)
 /*!
  * Atomic and using intrinsics
  */
-RAJA_INLINE char builtin_atomicAnd(char *acc, char value)
+RAJA_INLINE char builtin_atomicAnd(char* acc, char value)
 {
   return _InterlockedAnd8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicAnd(short *acc, short value)
+RAJA_INLINE short builtin_atomicAnd(short* acc, short value)
 {
   return _InterlockedAnd16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicAnd(long *acc, long value)
+RAJA_INLINE long builtin_atomicAnd(long* acc, long value)
 {
   return _InterlockedAnd(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicAnd(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicAnd(long long* acc, long long value)
 {
   return _InterlockedAnd64(acc, value);
 }
@@ -294,24 +298,24 @@ RAJA_INLINE long long builtin_atomicAnd(long long *acc, long long value)
 /*!
  * Atomic xor using intrinsics
  */
-RAJA_INLINE char builtin_atomicXor(char *acc, char value)
+RAJA_INLINE char builtin_atomicXor(char* acc, char value)
 {
   return _InterlockedXor8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicXor(short *acc, short value)
+RAJA_INLINE short builtin_atomicXor(short* acc, short value)
 {
   return _InterlockedXor16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicXor(long *acc, long value)
+RAJA_INLINE long builtin_atomicXor(long* acc, long value)
 {
   return _InterlockedXor(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicXor(long long* acc, long long value)
 {
   return _InterlockedXor64(acc, value);
 }
@@ -327,10 +331,11 @@ RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value)
  * using an intrinsic
  */
 template <typename T>
-struct builtin_useIntrinsic {
+struct builtin_useIntrinsic
+{
   static constexpr bool value =
-    (std::is_integral<T>::value || std::is_enum<T>::value) &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      (std::is_integral<T>::value || std::is_enum<T>::value) &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
 
@@ -339,54 +344,54 @@ struct builtin_useIntrinsic {
  * by reinterpreting inputs to types that intrinsics support
  */
 template <typename T>
-struct builtin_useReinterpret {
-  static constexpr bool value =
-    !std::is_integral<T>::value &&
-    !std::is_enum<T>::value &&
-    ((sizeof(T) == 1
+struct builtin_useReinterpret
+{
+  static constexpr bool value = !std::is_integral<T>::value &&
+                                !std::is_enum<T>::value &&
+                                ((sizeof(T) == 1
 #if !defined(UINT8_MAX)
-      && sizeof(unsigned char) == 1
+                                  && sizeof(unsigned char) == 1
 #endif
-     ) ||
-     (sizeof(T) == 2
+                                  ) ||
+                                 (sizeof(T) == 2
 #if !defined(UINT16_MAX)
-      && sizeof(unsigned short) == 2
+                                  && sizeof(unsigned short) == 2
 #endif
-     ) ||
-     (sizeof(T) == 4
+                                  ) ||
+                                 (sizeof(T) == 4
 #if !defined(UINT32_MAX)
-      && sizeof(unsigned int) == 4
+                                  && sizeof(unsigned int) == 4
 #endif
-     ) ||
-     (sizeof(T) == 8
+                                  ) ||
+                                 (sizeof(T) == 8
 #if !defined(UINT64_MAX)
-      && sizeof(unsigned long long) == 8
+                                  && sizeof(unsigned long long) == 8
 #endif
-     ));
+                                  ));
 
   using type =
-    std::conditional_t<sizeof(T) == 1,
+      std::conditional_t<sizeof(T) == 1,
 #if defined(UINT8_MAX)
-                       uint8_t,
+                         uint8_t,
 #else
-                       unsigned char,
+                         unsigned char,
 #endif
-    std::conditional_t<sizeof(T) == 2,
+                         std::conditional_t<sizeof(T) == 2,
 #if defined(UINT16_MAX)
-                       uint16_t,
+                                            uint16_t,
 #else
-                       unsigned short,
+                                            unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == 4,
+                                            std::conditional_t<sizeof(T) == 4,
 #if defined(UINT32_MAX)
-                       uint32_t,
+                                                               uint32_t,
 #else
-                       unsigned int,
+                                                               unsigned int,
 #endif
 #if defined(UINT64_MAX)
-                       uint64_t>>>;
+                                                               uint64_t>>>;
 #else
-                       unsigned long long>>>;
+                                                               unsigned long long>>>;
 #endif
 };
 
@@ -396,10 +401,11 @@ struct builtin_useReinterpret {
  * using a compare and swap loop
  */
 template <typename T>
-struct builtin_useCAS {
+struct builtin_useCAS
+{
   static constexpr bool value =
-    !std::is_integral<T>::value && !std::is_enum<T>::value &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      !std::is_integral<T>::value && !std::is_enum<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
 
@@ -413,7 +419,7 @@ struct builtin_useCAS {
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T* acc)
 {
   return __atomic_load_n(acc, __ATOMIC_RELAXED);
 }
@@ -424,7 +430,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T* acc, T value)
 {
   __atomic_store_n(acc, value, __ATOMIC_RELAXED);
 }
@@ -435,7 +441,7 @@ RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T* acc, T value)
 {
   return __atomic_exchange_n(acc, value, __ATOMIC_RELAXED);
 }
@@ -446,10 +452,10 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T* acc, T compare, T value)
 {
-  __atomic_compare_exchange_n(
-      acc, &compare, value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  __atomic_compare_exchange_n(acc, &compare, value, false, __ATOMIC_RELAXED,
+                              __ATOMIC_RELAXED);
   return compare;
 }
 
@@ -459,7 +465,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T* acc, T value)
 {
   return __atomic_fetch_add(acc, value, __ATOMIC_RELAXED);
 }
@@ -470,7 +476,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T* acc, T value)
 {
   return __atomic_fetch_sub(acc, value, __ATOMIC_RELAXED);
 }
@@ -481,7 +487,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T* acc, T value)
 {
   return __atomic_fetch_and(acc, value, __ATOMIC_RELAXED);
 }
@@ -492,7 +498,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T* acc, T value)
 {
   return __atomic_fetch_or(acc, value, __ATOMIC_RELAXED);
 }
@@ -503,7 +509,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T* acc, T value)
 {
   return __atomic_fetch_xor(acc, value, __ATOMIC_RELAXED);
 }
@@ -529,12 +535,12 @@ using builtin_useReinterpret_t = typename builtin_useReinterpret<T>::type;
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T* acc)
 {
   using R = builtin_useReinterpret_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicLoad(reinterpret_cast<R*>(acc)));
+      builtin_atomicLoad(reinterpret_cast<R*>(acc)));
 }
 
 
@@ -543,7 +549,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T* acc, T value)
 {
   using R = builtin_useReinterpret_t<T>;
 
@@ -557,13 +563,12 @@ RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T* acc, T value)
 {
   using R = builtin_useReinterpret_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicExchange(reinterpret_cast<R*>(acc),
-                           RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(builtin_atomicExchange(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -572,14 +577,13 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T* acc, T compare, T value)
 {
   using R = builtin_useReinterpret_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicCAS(reinterpret_cast<R*>(acc),
-                      RAJA::util::reinterp_A_as_B<T, R>(compare),
-                      RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(builtin_atomicCAS(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(compare),
+      RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -594,7 +598,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
+RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T& a, const T& b)
 {
   return a == b;
 }
@@ -607,7 +611,7 @@ RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
+RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T& a, const T& b)
 {
   using R = builtin_useReinterpret_t<T>;
 
@@ -622,15 +626,15 @@ RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
  * Returns the OLD value that was replaced by the result of this operation.
  */
 template <typename T, typename Oper>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
-                                                     Oper &&oper)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T* acc, Oper&& oper)
 {
   T old = builtin_atomicLoad(acc);
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = builtin_atomicCAS(acc, expected, oper(expected));
+    old      = builtin_atomicCAS(acc, expected, oper(expected));
   } while (!builtin_atomicCAS_equal(old, expected));
 
   return old;
@@ -644,21 +648,23 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
  * that was replaced by the result of this operation.
  */
 template <typename T, typename Oper, typename ShortCircuit>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
-                                                     Oper &&oper,
-                                                     ShortCircuit &&sc)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T* acc,
+                                                     Oper&& oper,
+                                                     ShortCircuit&& sc)
 {
   T old = builtin_atomicLoad(acc);
 
-  if (sc(old)) {
+  if (sc(old))
+  {
     return old;
   }
 
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = builtin_atomicCAS(acc, expected, oper(expected));
+    old      = builtin_atomicCAS(acc, expected, oper(expected));
   } while (!builtin_atomicCAS_equal(old, expected) && !sc(old));
 
   return old;
@@ -673,65 +679,50 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
 /*!
  * Atomic addition using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value)
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old + value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old + value; });
 }
 
 
 /*!
  * Atomic subtraction using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value)
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old - value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old - value; });
 }
 
 
 /*!
  * Atomic and using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value)
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old & value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old & value; });
 }
 
 
 /*!
  * Atomic or using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value)
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old | value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old | value; });
 }
 
 
 /*!
  * Atomic xor using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
+template <typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old ^ value;
-  });
+  return builtin_atomicCAS_loop(acc, [value](T old) { return old ^ value; });
 }
 
 
@@ -739,109 +730,105 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
 
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicLoad(builtin_atomic, T *acc)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicLoad(builtin_atomic, T* acc)
 {
   return detail::builtin_atomicLoad(acc);
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE void atomicStore(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE void atomicStore(builtin_atomic, T* acc, T value)
 {
   detail::builtin_atomicStore(acc, value);
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicAdd(acc, value);
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicSub(acc, value);
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc, [value](T old) { return value < old ? value : old; },
+      [value](T current) { return current <= value; });
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc, [value](T old) { return old < value ? value : old; },
+      [value](T current) { return value <= current; });
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T* acc)
 {
   return detail::builtin_atomicAdd(acc, static_cast<T>(1));
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T* acc, T value)
 {
-  return detail::builtin_atomicCAS_loop(acc, [value] (T old) {
-    return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
-  });
+  return detail::builtin_atomicCAS_loop(
+      acc, [value](T old)
+      { return value <= old ? static_cast<T>(0) : old + static_cast<T>(1); });
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T* acc)
 {
   return detail::builtin_atomicSub(acc, static_cast<T>(1));
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T* acc, T value)
 {
-  return detail::builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
-  });
+  return detail::builtin_atomicCAS_loop(acc,
+                                        [value](T old)
+                                        {
+                                          return old == static_cast<T>(0) ||
+                                                         value < old
+                                                     ? value
+                                                     : old - static_cast<T>(1);
+                                        });
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicAnd(acc, value);
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicOr(acc, value);
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicXor(acc, value);
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicExchange(acc, value);
 }
 
 template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicCAS(builtin_atomic, T *acc, T compare, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T
+atomicCAS(builtin_atomic, T* acc, T compare, T value)
 {
   return detail::builtin_atomicCAS(acc, compare, value);
 }
diff --git a/include/RAJA/policy/cuda.hpp b/include/RAJA/policy/cuda.hpp
index e9d5bc454f..40d5e68e4c 100644
--- a/include/RAJA/policy/cuda.hpp
+++ b/include/RAJA/policy/cuda.hpp
@@ -28,7 +28,7 @@
 #include <cuda_runtime.h>
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
 #include "RAJA/policy/cuda/forall.hpp"
diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index 88a89d5362..3fc1e4b90c 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -71,7 +71,8 @@ cudaDeviceProp& device_prop()
 
 
 //! Allocator for pinned memory for use in basic_mempool
-struct PinnedAllocator {
+struct PinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -90,7 +91,8 @@ struct PinnedAllocator {
 };
 
 //! Allocator for device memory for use in basic_mempool
-struct DeviceAllocator {
+struct DeviceAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -110,7 +112,8 @@ struct DeviceAllocator {
 
 //! Allocator for pre-zeroed device memory for use in basic_mempool
 //  Note: Memory must be zero when returned to mempool
-struct DeviceZeroedAllocator {
+struct DeviceZeroedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -132,7 +135,8 @@ struct DeviceZeroedAllocator {
 };
 
 //! Allocator for device pinned memory for use in basic_mempool
-struct DevicePinnedAllocator {
+struct DevicePinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -141,8 +145,10 @@ struct DevicePinnedAllocator {
     cudaErrchk(cudaGetDevice(&device));
     void* ptr;
     cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal));
-    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device));
-    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
+    cudaErrchk(
+        cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device));
+    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy,
+                             cudaCpuDeviceId));
 
     return ptr;
   }
@@ -158,22 +164,25 @@ struct DevicePinnedAllocator {
 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
 using device_zeroed_mempool_type =
     basic_mempool::MemPool<DeviceZeroedAllocator>;
-using device_pinned_mempool_type = basic_mempool::MemPool<DevicePinnedAllocator>;
+using device_pinned_mempool_type =
+    basic_mempool::MemPool<DevicePinnedAllocator>;
 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 
 namespace detail
 {
 
 //! struct containing data necessary to coordinate kernel launches with reducers
-struct cudaInfo {
+struct cudaInfo
+{
   const void* func = nullptr;
-  cuda_dim_t gridDim{0, 0, 0};
-  cuda_dim_t blockDim{0, 0, 0};
+  cuda_dim_t gridDim {0, 0, 0};
+  cuda_dim_t blockDim {0, 0, 0};
   size_t* dynamic_smem = nullptr;
-  ::RAJA::resources::Cuda res{::RAJA::resources::Cuda::CudaFromStream(0,0)};
+  ::RAJA::resources::Cuda res {::RAJA::resources::Cuda::CudaFromStream(0, 0)};
   bool setup_reducers = false;
 };
-struct cudaStatusInfo : cudaInfo {
+struct cudaStatusInfo : cudaInfo
+{
 #if defined(RAJA_ENABLE_OPENMP)
   omp::mutex lock;
 #endif
@@ -190,10 +199,7 @@ extern cudaStatusInfo tl_status;
 extern std::unordered_map<cudaStream_t, bool> g_stream_info_map;
 
 RAJA_INLINE
-void synchronize_impl(::RAJA::resources::Cuda res)
-{
-  res.wait();
-}
+void synchronize_impl(::RAJA::resources::Cuda res) { res.wait(); }
 
 }  // namespace detail
 
@@ -205,13 +211,16 @@ void synchronize()
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   bool synchronize = false;
-  for (auto& val : detail::g_stream_info_map) {
-    if (!val.second) {
+  for (auto& val : detail::g_stream_info_map)
+  {
+    if (!val.second)
+    {
       synchronize = true;
-      val.second = true;
+      val.second  = true;
     }
   }
-  if (synchronize) {
+  if (synchronize)
+  {
     cudaErrchk(cudaDeviceSynchronize());
   }
 }
@@ -224,12 +233,16 @@ void synchronize(::RAJA::resources::Cuda res)
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   auto iter = detail::g_stream_info_map.find(res.get_stream());
-  if (iter != detail::g_stream_info_map.end()) {
-    if (!iter->second) {
+  if (iter != detail::g_stream_info_map.end())
+  {
+    if (!iter->second)
+    {
       iter->second = true;
       detail::synchronize_impl(res);
     }
-  } else {
+  }
+  else
+  {
     RAJA_ABORT_OR_THROW("Cannot synchronize unknown resource.");
   }
 }
@@ -242,29 +255,40 @@ void launch(::RAJA::resources::Cuda res, bool async = true)
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   auto iter = detail::g_stream_info_map.find(res.get_stream());
-  if (iter != detail::g_stream_info_map.end()) {
+  if (iter != detail::g_stream_info_map.end())
+  {
     iter->second = !async;
-  } else {
+  }
+  else
+  {
     detail::g_stream_info_map.emplace(res.get_stream(), !async);
   }
-  if (!async) {
+  if (!async)
+  {
     detail::synchronize_impl(res);
   }
 }
 
 //! Launch kernel and indicate resource synchronization status
 RAJA_INLINE
-void launch(const void* func, cuda_dim_t gridDim, cuda_dim_t blockDim, void** args, size_t shmem,
-            ::RAJA::resources::Cuda res, bool async = true, const char *name = nullptr)
+void launch(const void* func,
+            cuda_dim_t gridDim,
+            cuda_dim_t blockDim,
+            void** args,
+            size_t shmem,
+            ::RAJA::resources::Cuda res,
+            bool async       = true,
+            const char* name = nullptr)
 {
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-  if(name) nvtxRangePushA(name);
+  if (name) nvtxRangePushA(name);
 #else
   RAJA_UNUSED_VAR(name);
 #endif
-  cudaErrchk(cudaLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
+  cudaErrchk(
+      cudaLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-  if(name) nvtxRangePop();
+  if (name) nvtxRangePop();
 #endif
   launch(res, async);
 }
@@ -283,9 +307,11 @@ cuda_dim_t currentGridDim() { return detail::tl_status.gridDim; }
 
 //! get grid size of current launch
 RAJA_INLINE
-cuda_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x *
-                                             detail::tl_status.gridDim.y *
-                                             detail::tl_status.gridDim.z; }
+cuda_dim_member_t currentGridSize()
+{
+  return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
+         detail::tl_status.gridDim.z;
+}
 
 //! get blockDim of current launch
 RAJA_INLINE
@@ -293,9 +319,11 @@ cuda_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
 
 //! get block size of current launch
 RAJA_INLINE
-cuda_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x *
-                                              detail::tl_status.blockDim.y *
-                                              detail::tl_status.blockDim.z; }
+cuda_dim_member_t currentBlockSize()
+{
+  return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
+         detail::tl_status.blockDim.z;
+}
 
 //! get dynamic shared memory usage for current launch
 RAJA_INLINE
@@ -310,7 +338,8 @@ size_t maxDynamicShmem()
   return func_attr.maxDynamicSharedSizeBytes;
 }
 
-constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::max();
+constexpr size_t dynamic_smem_allocation_failure =
+    std::numeric_limits<size_t>::max();
 
 //! Allocate dynamic shared memory for current launch
 //
@@ -322,24 +351,27 @@ constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::
 //  Returns an offset into dynamic shared memory aligned to align on success,
 //  or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory
 //  takes the failure return path.
-template < typename T, typename GetNFromMax >
-RAJA_INLINE
-size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T))
+template <typename T, typename GetNFromMax>
+RAJA_INLINE size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
+                                        size_t align = alignof(T))
 {
   const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
-  const size_t align_offset = ((unaligned_shmem % align) != size_t(0))
-      ? align - (unaligned_shmem % align)
-      : size_t(0);
-  const size_t aligned_shmem = unaligned_shmem + align_offset;
+  const size_t align_offset    = ((unaligned_shmem % align) != size_t(0))
+                                     ? align - (unaligned_shmem % align)
+                                     : size_t(0);
+  const size_t aligned_shmem   = unaligned_shmem + align_offset;
 
   const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
-  const size_t n_bytes = sizeof(T) *
-      std::forward<GetNFromMax>(get_n_from_max)(max_shmem_bytes / sizeof(T));
+  const size_t n_bytes = sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
+                                         max_shmem_bytes / sizeof(T));
 
-  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) {
+  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes)
+  {
     *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
     return aligned_shmem;
-  } else {
+  }
+  else
+  {
     return dynamic_smem_allocation_failure;
   }
 }
@@ -354,16 +386,17 @@ ::RAJA::resources::Cuda currentResource() { return detail::tl_status.res; }
 // their copy constructors. Both look at tl_status to setup per kernel launch
 // resources.
 template <typename LOOP_BODY>
-RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
-    const void* func,
-    cuda_dim_t gridDim,
-    cuda_dim_t blockDim,
-    size_t& dynamic_smem,
-    ::RAJA::resources::Cuda res,
-    LOOP_BODY&& loop_body)
+RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type
+make_launch_body(const void* func,
+                 cuda_dim_t gridDim,
+                 cuda_dim_t blockDim,
+                 size_t& dynamic_smem,
+                 ::RAJA::resources::Cuda res,
+                 LOOP_BODY&& loop_body)
 {
-  ::RAJA::detail::ScopedAssignment<detail::cudaInfo> info_sa(detail::tl_status,
-      detail::cudaInfo{func, gridDim, blockDim, &dynamic_smem, res, true});
+  ::RAJA::detail::ScopedAssignment<detail::cudaInfo> info_sa(
+      detail::tl_status,
+      detail::cudaInfo {func, gridDim, blockDim, &dynamic_smem, res, true});
 
   using return_type = typename std::remove_reference<LOOP_BODY>::type;
   return return_type(std::forward<LOOP_BODY>(loop_body));
@@ -378,7 +411,8 @@ static constexpr size_t cuda_occupancy_uninitialized_size_t =
 struct CudaFixedMaxBlocksData
 {
   int device_sm_per_device = cuda::device_prop().multiProcessorCount;
-  int device_max_threads_per_sm = cuda::device_prop().maxThreadsPerMultiProcessor;
+  int device_max_threads_per_sm =
+      cuda::device_prop().maxThreadsPerMultiProcessor;
 };
 
 //! Get the maximum theoretical occupancy of the device
@@ -394,25 +428,26 @@ CudaFixedMaxBlocksData cuda_max_blocks()
 struct CudaOccMaxBlocksThreadsData
 {
   size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
-  int func_max_blocks_per_device = cuda_occupancy_uninitialized_int;
-  int func_max_threads_per_block = cuda_occupancy_uninitialized_int;
+  int func_max_blocks_per_device      = cuda_occupancy_uninitialized_int;
+  int func_max_threads_per_block      = cuda_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with unknown threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE CudaOccMaxBlocksThreadsData
+cuda_occupancy_max_blocks_threads(const void* func,
+                                  size_t func_dynamic_shmem_per_block)
 {
   static thread_local CudaOccMaxBlocksThreadsData data;
 
-  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
 
     cudaErrchk(cudaOccupancyMaxPotentialBlockSize(
-        &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block));
-
+        &data.func_max_blocks_per_device, &data.func_max_threads_per_block,
+        func, func_dynamic_shmem_per_block));
   }
 
   return data;
@@ -422,48 +457,50 @@ CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func,
 struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData
 {
   size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
-  int func_threads_per_block = cuda_occupancy_uninitialized_int;
-  int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int;
+  int func_threads_per_block          = cuda_occupancy_uninitialized_int;
+  int func_max_blocks_per_sm          = cuda_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with compile time threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block >
-RAJA_INLINE
-CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block>
+RAJA_INLINE CudaOccMaxBlocksData
+cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block)
 {
   static thread_local CudaOccMaxBlocksData data;
 
-  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
-    data.func_threads_per_block = func_threads_per_block;
+    data.func_threads_per_block       = func_threads_per_block;
 
     cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
-
+        &data.func_max_blocks_per_sm, func, func_threads_per_block,
+        func_dynamic_shmem_per_block));
   }
 
   return data;
 }
 
 //! Get the maximum occupancy of a kernel with runtime threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block, int func_threads_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE CudaOccMaxBlocksData
+cuda_occupancy_max_blocks(const void* func,
+                          size_t func_dynamic_shmem_per_block,
+                          int func_threads_per_block)
 {
   static thread_local CudaOccMaxBlocksData data;
 
-  if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
-       data.func_threads_per_block != func_threads_per_block ) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
+      data.func_threads_per_block != func_threads_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
-    data.func_threads_per_block = func_threads_per_block;
+    data.func_threads_per_block       = func_threads_per_block;
 
     cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
-
+        &data.func_max_blocks_per_sm, func, func_threads_per_block,
+        func_dynamic_shmem_per_block));
   }
 
   return data;
@@ -496,14 +533,16 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
  *
  ******************************************************************************
  */
-template < typename IdxT, typename Concretizer, typename UniqueMarker>
+template <typename IdxT, typename Concretizer, typename UniqueMarker>
 struct ConcretizerImpl
 {
-  ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len)
-    : m_func(func)
-    , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block)
-    , m_len(len)
-  { }
+  ConcretizerImpl(const void* func,
+                  size_t func_dynamic_shmem_per_block,
+                  IdxT len)
+      : m_func(func),
+        m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
+        m_len(len)
+  {}
 
   IdxT get_max_block_size() const
   {
@@ -517,10 +556,14 @@ struct ConcretizerImpl
   IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
-    if (func_threads_per_block <= func_max_threads_per_block) {
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    if (func_threads_per_block <= func_max_threads_per_block)
+    {
       return func_threads_per_block;
-    } else {
+    }
+    else
+    {
       return IdxT(0);
     }
   }
@@ -528,7 +571,8 @@ struct ConcretizerImpl
   //! Get a grid size when block size is specified
   IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const
   {
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return func_blocks_per_device;
   }
 
@@ -536,16 +580,17 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_len() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
   //! Get a block size when grid size is specified
   IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
     return std::min(func_threads_per_block, func_max_threads_per_block);
   }
 
@@ -554,8 +599,10 @@ struct ConcretizerImpl
   {
     auto data = cuda_occupancy_max_blocks<UniqueMarker>(
         m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
-    IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size<IdxT>(data);
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_max_blocks_per_device =
+        Concretizer::template get_max_grid_size<IdxT>(data);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return std::min(func_blocks_per_device, func_max_blocks_per_device);
   }
 
@@ -563,9 +610,9 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_device() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        this->get_grid_size_to_fit_device(func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
 private:
diff --git a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
index f6269b36e4..5aeaba0883 100644
--- a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
@@ -41,9 +41,9 @@ namespace cuda
 
 // global function that creates the value on the device using the
 // factory and writes it into a pinned ptr
-template < typename Factory >
-__global__ void get_value_global(
-    typename Factory::value_type* ptr, Factory factory)
+template <typename Factory>
+__global__ void get_value_global(typename Factory::value_type* ptr,
+                                 Factory factory)
 {
   *ptr = factory();
 }
@@ -52,8 +52,9 @@ __global__ void get_value_global(
 inline void* get_cached_value_ptr(size_t nbytes)
 {
   static size_t cached_nbytes = 0;
-  static void* ptr = nullptr;
-  if (nbytes > cached_nbytes) {
+  static void* ptr            = nullptr;
+  if (nbytes > cached_nbytes)
+  {
     cached_nbytes = 0;
     cudaErrchk(cudaFreeHost(ptr));
     cudaErrchk(cudaMallocHost(&ptr, nbytes));
@@ -73,7 +74,7 @@ inline std::mutex& get_value_mutex()
 // get the device function pointer by calling a global function to
 // write it into a pinned ptr, beware different instantiates of this
 // function may run concurrently
-template < typename Factory >
+template <typename Factory>
 inline auto get_value(Factory&& factory)
 {
   using value_type = typename std::decay_t<Factory>::value_type;
@@ -81,8 +82,9 @@ inline auto get_value(Factory&& factory)
 
   auto res = ::camp::resources::Cuda::get_default();
   auto ptr = static_cast<value_type*>(get_cached_value_ptr(sizeof(value_type)));
-  auto func = reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
-  void *args[] = {(void*)&ptr, (void*)&factory};
+  auto func =
+      reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
+  void* args[] = {(void*)&ptr, (void*)&factory};
   cudaErrchk(cudaLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
   cudaErrchk(cudaStreamSynchronize(res.get_stream()));
 
@@ -91,7 +93,7 @@ inline auto get_value(Factory&& factory)
 
 // get the device function pointer and store it so it can be used
 // multiple times
-template < typename Factory >
+template <typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -101,17 +103,20 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace cuda
 
 /*!
-* Populate and return a Dispatcher object that can be used in device code
-*/
-template < typename T, typename Dispatcher_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async >
-inline const Dispatcher_T* get_Dispatcher(cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> const&)
+ * Populate and return a Dispatcher object that can be used in device code
+ */
+template <typename T,
+          typename Dispatcher_T,
+          size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async>
+inline const Dispatcher_T*
+get_Dispatcher(cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> const&)
 {
-  static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return cuda::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+  static Dispatcher_T dispatcher {Dispatcher_T::template makeDispatcher<T>(
+      [](auto&& factory) {
+        return cuda::get_cached_value(std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
index 41fe17c84a..3cf8e6408f 100644
--- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
@@ -36,46 +36,48 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          typename... Args>
+struct WorkRunner<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
     : WorkRunnerForallOrdered<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+          RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+          RAJA::ordered,
+          DISPATCH_POLICY_T,
+          ALLOCATOR_T,
+          INDEX_T,
+          Args...>
 {
   using base = WorkRunnerForallOrdered<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+      RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+      RAJA::ordered,
+      DISPATCH_POLICY_T,
+      ALLOCATOR_T,
+      INDEX_T,
+      Args...>;
   using base::base;
-  using IndexType = INDEX_T;
+  using IndexType       = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
 
   ///
   /// run the loops in the given work container in order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -83,8 +85,12 @@ struct WorkRunner<
     IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
 
     // Only synchronize if we had something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::cuda::synchronize(r); }
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
+      if (!Async)
+      {
+        RAJA::cuda::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -95,46 +101,48 @@ struct WorkRunner<
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          typename... Args>
+struct WorkRunner<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
     : WorkRunnerForallReverse<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+          RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+          RAJA::reverse_ordered,
+          DISPATCH_POLICY_T,
+          ALLOCATOR_T,
+          INDEX_T,
+          Args...>
 {
   using base = WorkRunnerForallReverse<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+      RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+      RAJA::reverse_ordered,
+      DISPATCH_POLICY_T,
+      ALLOCATOR_T,
+      INDEX_T,
+      Args...>;
   using base::base;
-  using IndexType = INDEX_T;
+  using IndexType       = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
 
   ///
   /// run the loops in the given work container in reverse order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -142,8 +150,12 @@ struct WorkRunner<
     IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
 
     // Only synchronize if we had something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::cuda::synchronize(r); }
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
+      if (!Async)
+      {
+        RAJA::cuda::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -155,15 +167,17 @@ struct WorkRunner<
  * A body and segment holder for storing loops that will be executed
  * on the device
  */
-template <typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
+template <typename Segment_type,
+          typename LoopBody,
+          typename index_type,
+          typename... Args>
 struct HoldCudaDeviceXThreadblockLoop
 {
-  template < typename segment_in, typename body_in >
+  template <typename segment_in, typename body_in>
   HoldCudaDeviceXThreadblockLoop(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {}
 
   RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
   {
@@ -171,10 +185,11 @@ struct HoldCudaDeviceXThreadblockLoop
     // TODO:: decide whether or not to privatize the loop body
     const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
     const index_type stride  = blockDim.x * gridDim.x;
-    const auto begin = m_segment.begin();
-    const auto end   = m_segment.end();
+    const auto begin         = m_segment.begin();
+    const auto end           = m_segment.end();
     const index_type len(end - begin);
-    for ( index_type i = i_begin; i < len; i += stride ) {
+    for (index_type i = i_begin; i < len; i += stride)
+    {
       m_body(begin[i], std::forward<Args>(args)...);
     }
   }
@@ -184,12 +199,12 @@ struct HoldCudaDeviceXThreadblockLoop
   LoopBody m_body;
 };
 
-template < size_t BLOCK_SIZE,
-           size_t BLOCKS_PER_SM,
-           typename StorageIter,
-           typename value_type,
-           typename index_type,
-           typename ... Args >
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          typename StorageIter,
+          typename value_type,
+          typename index_type,
+          typename... Args>
 __launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__
     void cuda_unordered_y_block_global(StorageIter iter, Args... args)
 {
@@ -206,36 +221,42 @@ __launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__
  * the x direction, with the number of threads in the x dimension determined
  * by the average number of iterates per loop
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+    RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+    RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
+    DISPATCH_POLICY_T,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>
 {
-  using exec_policy = RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
-  using order_policy = RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
+  using exec_policy =
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
+  using order_policy = RAJA::policy::cuda::
+      unordered_cuda_loop_y_block_iter_x_threadblock_average;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using Allocator = ALLOCATOR_T;
-  using index_type = INDEX_T;
-  using resource_type = resources::Cuda;
+  using Allocator       = ALLOCATOR_T;
+  using index_type      = INDEX_T;
+  using resource_type   = resources::Cuda;
 
   // The type that will hold the segment and loop body in work storage
-  struct holder_type {
-    template < typename T >
+  struct holder_type
+  {
+    template <typename T>
     using type = HoldCudaDeviceXThreadblockLoop<
-        typename camp::at<T, camp::num<0>>::type, // ITERABLE
-        typename camp::at<T, camp::num<1>>::type, // LOOP_BODY
-        index_type, Args...>;
+        typename camp::at<T, camp::num<0>>::type,  // ITERABLE
+        typename camp::at<T, camp::num<1>>::type,  // LOOP_BODY
+        index_type,
+        Args...>;
   };
   ///
-  template < typename T >
+  template <typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -244,21 +265,25 @@ struct WorkRunner<
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::cuda, dispatcher_holder_policy, RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>, Args...>;
+  using dispatcher_type =
+      Dispatcher<Platform::cuda,
+                 dispatcher_holder_policy,
+                 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>,
+                 Args...>;
 
   WorkRunner() = default;
 
-  WorkRunner(WorkRunner const&) = delete;
+  WorkRunner(WorkRunner const&)            = delete;
   WorkRunner& operator=(WorkRunner const&) = delete;
 
-  WorkRunner(WorkRunner && o)
-    : m_total_iterations(o.m_total_iterations)
+  WorkRunner(WorkRunner&& o) : m_total_iterations(o.m_total_iterations)
   {
     o.m_total_iterations = 0;
   }
-  WorkRunner& operator=(WorkRunner && o)
+  WorkRunner& operator=(WorkRunner&& o)
   {
     m_total_iterations = o.m_total_iterations;
 
@@ -268,35 +293,41 @@ struct WorkRunner<
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename Iterable, typename LoopBody >
-  inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body)
+  template <typename WorkContainer, typename Iterable, typename LoopBody>
+  inline void
+  enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body)
   {
     using Iterator  = camp::decay<decltype(std::begin(iter))>;
     using LOOP_BODY = camp::decay<LoopBody>;
     using ITERABLE  = camp::decay<Iterable>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+    using IndexType =
+        camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
     using holder = holder_type_t<camp::list<ITERABLE, LOOP_BODY>>;
 
-    // using true_value_type = typename WorkContainer::template true_value_type<holder>;
+    // using true_value_type = typename WorkContainer::template
+    // true_value_type<holder>;
 
     Iterator begin = std::begin(iter);
-    Iterator end = std::end(iter);
-    IndexType len = std::distance(begin, end);
+    Iterator end   = std::end(iter);
+    IndexType len  = std::distance(begin, end);
 
     // Only launch kernel if we have something to iterate over
-    if (len > 0 && BLOCK_SIZE > 0) {
+    if (len > 0 && BLOCK_SIZE > 0)
+    {
 
       m_total_iterations += len;
 
       //
-      // TODO: Privatize the loop_body, using make_launch_body to setup reductions
+      // TODO: Privatize the loop_body, using make_launch_body to setup
+      // reductions
       //
       // LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-      //     gridSize, blockSize, shmem, stream, std::forward<LoopBody>(loop_body));
+      //     gridSize, blockSize, shmem, stream,
+      //     std::forward<LoopBody>(loop_body));
 
       storage.template emplace<holder>(
-          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
+          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy {}),
           std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
     }
   }
@@ -304,37 +335,44 @@ struct WorkRunner<
   // no extra storage required here
   using per_run_storage = int;
 
-  template < typename WorkContainer >
-  per_run_storage run(WorkContainer const& storage, resource_type r, Args... args) const
+  template <typename WorkContainer>
+  per_run_storage
+  run(WorkContainer const& storage, resource_type r, Args... args) const
   {
-    using Iterator  = camp::decay<decltype(std::begin(storage))>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(storage), std::end(storage)))>;
+    using Iterator   = camp::decay<decltype(std::begin(storage))>;
+    using IndexType  = camp::decay<decltype(std::distance(std::begin(storage),
+                                                          std::end(storage)))>;
     using value_type = typename WorkContainer::value_type;
 
-    per_run_storage run_storage{};
+    per_run_storage run_storage {};
 
-    auto func = cuda_unordered_y_block_global<BLOCK_SIZE, BLOCKS_PER_SM, Iterator, value_type, index_type, Args...>;
+    auto func =
+        cuda_unordered_y_block_global<BLOCK_SIZE, BLOCKS_PER_SM, Iterator,
+                                      value_type, index_type, Args...>;
 
     //
     // Compute the requested iteration space size
     //
-    Iterator begin = std::begin(storage);
-    Iterator end = std::end(storage);
+    Iterator begin      = std::begin(storage);
+    Iterator end        = std::end(storage);
     IndexType num_loops = std::distance(begin, end);
 
     // Only launch kernel if we have something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
 
-      index_type average_iterations = m_total_iterations / static_cast<index_type>(num_loops);
+      index_type average_iterations =
+          m_total_iterations / static_cast<index_type>(num_loops);
 
       //
       // Compute the number of blocks
       //
       constexpr index_type block_size = static_cast<index_type>(BLOCK_SIZE);
-      cuda_dim_t blockSize{static_cast<cuda_dim_member_t>(block_size), 1, 1};
-      cuda_dim_t gridSize{static_cast<cuda_dim_member_t>((average_iterations + block_size - 1) / block_size),
-                          static_cast<cuda_dim_member_t>(num_loops),
-                          1};
+      cuda_dim_t blockSize {static_cast<cuda_dim_member_t>(block_size), 1, 1};
+      cuda_dim_t gridSize {
+          static_cast<cuda_dim_member_t>((average_iterations + block_size - 1) /
+                                         block_size),
+          static_cast<cuda_dim_member_t>(num_loops), 1};
 
       RAJA_FT_BEGIN;
 
@@ -347,8 +385,9 @@ struct WorkRunner<
         //
         // Launch the kernel
         //
-        void* func_args[] = { (void*)&begin, (void*)&args... };
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async);
+        void* func_args[] = {(void*)&begin, (void*)&args...};
+        RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args,
+                           shmem, r, Async);
       }
 
       RAJA_FT_END;
@@ -358,10 +397,7 @@ struct WorkRunner<
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  {
-    m_total_iterations = 0;
-  }
+  void clear() { m_total_iterations = 0; }
 
 private:
   index_type m_total_iterations = 0;
diff --git a/include/RAJA/policy/cuda/atomic.hpp b/include/RAJA/policy/cuda/atomic.hpp
index aedfe91a03..a1b3cd5279 100644
--- a/include/RAJA/policy/cuda/atomic.hpp
+++ b/include/RAJA/policy/cuda/atomic.hpp
@@ -25,7 +25,8 @@
 #include <stdexcept>
 #include <type_traits>
 
-#if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6
+#if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 &&                     \
+    __CUDACC_VER_MINOR__ >= 6
 #define RAJA_ENABLE_CUDA_ATOMIC_REF
 #endif
 
@@ -65,11 +66,11 @@ namespace detail
  * cuda_useBuiltinExchange below.
  */
 template <typename T>
-struct cuda_useBuiltinCommon {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+struct cuda_useBuiltinCommon
+{
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value;
 };
 
 
@@ -81,15 +82,15 @@ struct cuda_useBuiltinCommon {
  * below.
  */
 template <typename T>
-struct cuda_useReinterpretCommon {
-  static constexpr bool value =
-    !cuda_useBuiltinCommon<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+struct cuda_useReinterpretCommon
+{
+  static constexpr bool value = !cuda_useBuiltinCommon<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 
@@ -109,7 +110,7 @@ using cuda_useReinterpretCommon_t = typename cuda_useReinterpretCommon<T>::type;
  */
 template <typename T,
           std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
+RAJA_INLINE __device__ T cuda_atomicOr(T* acc, T value)
 {
   return ::atomicOr(acc, value);
 }
@@ -124,12 +125,12 @@ RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
  * using a builtin
  */
 template <typename T>
-struct cuda_useBuiltinExchange {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value ||
-    std::is_same<T, float>::value;
+struct cuda_useBuiltinExchange
+{
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value ||
+                                std::is_same<T, float>::value;
 };
 
 /*!
@@ -137,22 +138,23 @@ struct cuda_useBuiltinExchange {
  * by reinterpreting inputs to types that the builtin exchange supports
  */
 template <typename T>
-struct cuda_useReinterpretExchange {
-  static constexpr bool value =
-    !cuda_useBuiltinExchange<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+struct cuda_useReinterpretExchange
+{
+  static constexpr bool value = !cuda_useBuiltinExchange<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
 template <typename T>
-using cuda_useReinterpretExchange_t = typename cuda_useReinterpretExchange<T>::type;
+using cuda_useReinterpretExchange_t =
+    typename cuda_useReinterpretExchange<T>::type;
 
 /*!
  * Performs an atomic exchange using a builtin function. Stores the new value
@@ -160,7 +162,7 @@ using cuda_useReinterpretExchange_t = typename cuda_useReinterpretExchange<T>::t
  */
 template <typename T,
           std::enable_if_t<cuda_useBuiltinExchange<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value)
+RAJA_INLINE __device__ T cuda_atomicExchange(T* acc, T value)
 {
   return ::atomicExch(acc, value);
 }
@@ -171,13 +173,12 @@ RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<cuda_useReinterpretExchange<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value)
+RAJA_INLINE __device__ T cuda_atomicExchange(T* acc, T value)
 {
   using R = cuda_useReinterpretExchange_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicExchange(reinterpret_cast<R*>(acc),
-                        RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(cuda_atomicExchange(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -187,41 +188,41 @@ RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value)
 #if defined(RAJA_ENABLE_CUDA_ATOMIC_REF)
 
 template <typename T>
-RAJA_INLINE __device__ T cuda_atomicLoad(T *acc)
+RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
 {
   return cuda::atomic_ref<T, cuda::thread_scope_device>(*acc).load(
-    cuda::memory_order_relaxed{});
+      cuda::memory_order_relaxed {});
 }
 
 
 template <typename T>
-RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value)
+RAJA_INLINE __device__ void cuda_atomicStore(T* acc, T value)
 {
   cuda::atomic_ref<T, cuda::thread_scope_device>(*acc).store(
-    value, cuda::memory_order_relaxed{});
+      value, cuda::memory_order_relaxed {});
 }
 
 #else
 
 template <typename T,
           std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicLoad(T *acc)
+RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
 {
   return cuda_atomicOr(acc, static_cast<T>(0));
 }
 
 template <typename T,
           std::enable_if_t<cuda_useReinterpretCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicLoad(T *acc)
+RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
 {
   using R = cuda_useReinterpretCommon_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicLoad(reinterpret_cast<R*>(acc)));
+      cuda_atomicLoad(reinterpret_cast<R*>(acc)));
 }
 
 template <typename T>
-RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value)
+RAJA_INLINE __device__ void cuda_atomicStore(T* acc, T value)
 {
   cuda_atomicExchange(acc, value);
 }
@@ -238,14 +239,14 @@ RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value)
  * implemented using a builtin
  */
 template <typename T>
-struct cuda_useBuiltinCAS {
+struct cuda_useBuiltinCAS
+{
   static constexpr bool value =
 #if __CUDA_ARCH__ >= 700
-    std::is_same<T, unsigned short int>::value ||
+      std::is_same<T, unsigned short int>::value ||
 #endif
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+      std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+      std::is_same<T, unsigned long long>::value;
 };
 
 /*!
@@ -254,29 +255,28 @@ struct cuda_useBuiltinCAS {
  * and swap supports
  */
 template <typename T>
-struct cuda_useReinterpretCAS {
-  static constexpr bool value =
-    !cuda_useBuiltinCAS<T>::value &&
-    (
+struct cuda_useReinterpretCAS
+{
+  static constexpr bool value = !cuda_useBuiltinCAS<T>::value &&
+                                (
 #if __CUDA_ARCH__ >= 700
-     sizeof(T) == sizeof(unsigned short) ||
+                                    sizeof(T) == sizeof(unsigned short) ||
 #endif
-     sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long)
-    );
+                                    sizeof(T) == sizeof(unsigned int) ||
+                                    sizeof(T) == sizeof(unsigned long long));
 
   using type =
 #if __CUDA_ARCH__ >= 700
-    std::conditional_t<sizeof(T) == sizeof(unsigned short),
-                       unsigned short,
+      std::conditional_t<sizeof(T) == sizeof(unsigned short),
+                         unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int,
-                       unsigned long long>
+                         std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                            unsigned int,
+                                            unsigned long long>
 #if __CUDA_ARCH__ >= 700
-                      >
+                         >
 #endif
-    ;
+      ;
 };
 
 /*!
@@ -287,21 +287,20 @@ using cuda_useReinterpretCAS_t = typename cuda_useReinterpretCAS<T>::type;
 
 template <typename T,
           std::enable_if_t<cuda_useBuiltinCAS<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value)
+RAJA_INLINE __device__ T cuda_atomicCAS(T* acc, T compare, T value)
 {
   return ::atomicCAS(acc, compare, value);
 }
 
 template <typename T,
           std::enable_if_t<cuda_useReinterpretCAS<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value)
+RAJA_INLINE __device__ T cuda_atomicCAS(T* acc, T compare, T value)
 {
   using R = cuda_useReinterpretCAS_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicCAS(reinterpret_cast<R*>(acc),
-                   RAJA::util::reinterp_A_as_B<T, R>(compare),
-                   RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(cuda_atomicCAS(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(compare),
+      RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 /*!
@@ -334,42 +333,44 @@ RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b)
  * operation.
  */
 template <typename T, typename Oper>
-RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
-                                             Oper&& oper)
+RAJA_INLINE __device__ T cuda_atomicCAS_loop(T* acc, Oper&& oper)
 {
   T old = cuda_atomicLoad(acc);
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = cuda_atomicCAS(acc, expected, oper(expected));
+    old      = cuda_atomicCAS(acc, expected, oper(expected));
   } while (!cuda_atomicCAS_equal(old, expected));
 
   return old;
 }
 
 /*!
- * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting.
- * Implementation uses the existing CUDA supplied unsigned 32-bit or 64-bit CAS
- * operator. Returns the OLD value that was replaced by the result of this
- * operation.
+ * Generic implementation of any atomic 32-bit or 64-bit operator with
+ * short-circuiting. Implementation uses the existing CUDA supplied unsigned
+ * 32-bit or 64-bit CAS operator. Returns the OLD value that was replaced by the
+ * result of this operation.
  */
 template <typename T, typename Oper, typename ShortCircuit>
-RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
+RAJA_INLINE __device__ T cuda_atomicCAS_loop(T* acc,
                                              Oper&& oper,
                                              ShortCircuit&& sc)
 {
   T old = cuda_atomicLoad(acc);
 
-  if (sc(old)) {
+  if (sc(old))
+  {
     return old;
   }
 
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = cuda_atomicCAS(acc, expected, oper(expected));
+    old      = cuda_atomicCAS(acc, expected, oper(expected));
   } while (!cuda_atomicCAS_equal(old, expected) && !sc(old));
 
   return old;
@@ -379,29 +380,28 @@ RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
 /*!
  * Atomic addition
  */
-using cuda_atomicAdd_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long int,
-  float
+using cuda_atomicAdd_builtin_types = ::camp::list<int,
+                                                  unsigned int,
+                                                  unsigned long long int,
+                                                  float
 #if __CUDA_ARCH__ >= 600
-  ,
-  double
+                                                  ,
+                                                  double
 #endif
->;
+                                                  >;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicAdd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value)
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicAdd_builtin_types>* =
+              nullptr>
+RAJA_INLINE __device__ T cuda_atomicAdd(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old + value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old + value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicAdd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicAdd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T cuda_atomicAdd(T* acc, T value)
 {
   return ::atomicAdd(acc, value);
 }
@@ -412,39 +412,39 @@ RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value)
  */
 using cuda_atomicSub_builtin_types = cuda_atomicAdd_builtin_types;
 
-using cuda_atomicSub_via_Sub_builtin_types = ::camp::list<
-  int,
-  unsigned int
->;
+using cuda_atomicSub_via_Sub_builtin_types = ::camp::list<int, unsigned int>;
 
-using cuda_atomicSub_via_Add_builtin_types = ::camp::list<
-  unsigned long long int,
-  float
+using cuda_atomicSub_via_Add_builtin_types =
+    ::camp::list<unsigned long long int,
+                 float
 #if __CUDA_ARCH__ >= 600
-  ,
-  double
+                 ,
+                 double
 #endif
->;
+                 >;
 
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicSub_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicSub_builtin_types>* =
+              nullptr>
+RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old - value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old - value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Sub_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Sub_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
 {
   return ::atomicSub(acc, value);
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Add_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Add_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
 {
   return ::atomicAdd(acc, -value);
 }
@@ -453,37 +453,34 @@ RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
 /*!
  * Atomic min/max
  */
-using cuda_atomicMinMax_builtin_types = ::camp::list<
-  int,
-  unsigned int
+using cuda_atomicMinMax_builtin_types = ::camp::list<int,
+                                                     unsigned int
 #if __CUDA_ARCH__ >= 500
-  ,
-  long long int,
-  unsigned long long int
+                                                     ,
+                                                     long long int,
+                                                     unsigned long long int
 #endif
->;
+                                                     >;
 
 
 /*!
  * Atomic min
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicMin(T* acc, T value)
 {
   return cuda_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc, [value](T old) { return value < old ? value : old; },
+      [value](T current) { return current <= value; });
 }
 
 template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value)
+          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* =
+              nullptr>
+RAJA_INLINE __device__ T cuda_atomicMin(T* acc, T value)
 {
   return ::atomicMin(acc, value);
 }
@@ -492,23 +489,21 @@ RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value)
 /*!
  * Atomic max
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicMax(T* acc, T value)
 {
   return cuda_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc, [value](T old) { return old < value ? value : old; },
+      [value](T current) { return value <= current; });
 }
 
 template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value)
+          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* =
+              nullptr>
+RAJA_INLINE __device__ T cuda_atomicMax(T* acc, T value)
 {
   return ::atomicMax(acc, value);
 }
@@ -517,28 +512,30 @@ RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value)
 /*!
  * Atomic increment/decrement with reset
  */
-using cuda_atomicIncDecReset_builtin_types = ::camp::list<
-  unsigned int
->;
+using cuda_atomicIncDecReset_builtin_types = ::camp::list<unsigned int>;
 
 
 /*!
  * Atomic increment with reset
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicInc(T* acc, T value)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
-  });
+  return cuda_atomicCAS_loop(
+      acc, [value](T old)
+      { return value <= old ? static_cast<T>(0) : old + static_cast<T>(1); });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicInc(T* acc, T value)
 {
   return ::atomicInc(acc, value);
 }
@@ -548,7 +545,7 @@ RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value)
  * Atomic increment (implemented in terms of atomic addition)
  */
 template <typename T>
-RAJA_INLINE __device__ T cuda_atomicInc(T *acc)
+RAJA_INLINE __device__ T cuda_atomicInc(T* acc)
 {
   return cuda_atomicAdd(acc, static_cast<T>(1));
 }
@@ -557,20 +554,28 @@ RAJA_INLINE __device__ T cuda_atomicInc(T *acc)
 /*!
  * Atomic decrement with reset
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicDec(T* acc, T value)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
-  });
+  return cuda_atomicCAS_loop(acc,
+                             [value](T old)
+                             {
+                               return old == static_cast<T>(0) || value < old
+                                          ? value
+                                          : old - static_cast<T>(1);
+                             });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicDec(T* acc, T value)
 {
   return ::atomicDec(acc, value);
 }
@@ -580,7 +585,7 @@ RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value)
  * Atomic decrement (implemented in terms of atomic subtraction)
  */
 template <typename T>
-RAJA_INLINE __device__ T cuda_atomicDec(T *acc)
+RAJA_INLINE __device__ T cuda_atomicDec(T* acc)
 {
   return cuda_atomicSub(acc, static_cast<T>(1));
 }
@@ -589,28 +594,25 @@ RAJA_INLINE __device__ T cuda_atomicDec(T *acc)
 /*!
  * Atomic bitwise functions (and, or, xor)
  */
-using cuda_atomicBit_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long int
->;
+using cuda_atomicBit_builtin_types =
+    ::camp::list<int, unsigned int, unsigned long long int>;
 
 
 /*!
  * Atomic and
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value)
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* =
+              nullptr>
+RAJA_INLINE __device__ T cuda_atomicAnd(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old & value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old & value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T cuda_atomicAnd(T* acc, T value)
 {
   return ::atomicAnd(acc, value);
 }
@@ -620,12 +622,11 @@ RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value)
  * Atomic or
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* =
+              nullptr>
+RAJA_INLINE __device__ T cuda_atomicOr(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old | value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old | value; });
 }
 
 /*!
@@ -638,17 +639,17 @@ RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
  * Atomic xor
  */
 template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
+          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* =
+              nullptr>
+RAJA_INLINE __device__ T cuda_atomicXor(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old ^ value;
-  });
+  return cuda_atomicCAS_loop(acc, [value](T old) { return old ^ value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T cuda_atomicXor(T* acc, T value)
 {
   return ::atomicXor(acc, value);
 }
@@ -667,185 +668,195 @@ RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
  */
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicLoad(cuda_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(cuda_atomic_explicit<host_policy>,
+                                          T* acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicLoad(acc);
 #else
-  return RAJA::atomicLoad(host_policy{}, acc);
+  return RAJA::atomicLoad(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
 RAJA_INLINE RAJA_HOST_DEVICE void
-atomicStore(cuda_atomic_explicit<host_policy>, T *acc, T value)
+atomicStore(cuda_atomic_explicit<host_policy>, T* acc, T value)
 {
 #ifdef __CUDA_ARCH__
   detail::cuda_atomicStore(acc, value);
 #else
-  RAJA::atomicStore(host_policy{}, acc, value);
+  RAJA::atomicStore(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAdd(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicAdd(acc, value);
 #else
-  return RAJA::atomicAdd(host_policy{}, acc, value);
+  return RAJA::atomicAdd(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicSub(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicSub(acc, value);
 #else
-  return RAJA::atomicSub(host_policy{}, acc, value);
+  return RAJA::atomicSub(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMin(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicMin(acc, value);
 #else
-  return RAJA::atomicMin(host_policy{}, acc, value);
+  return RAJA::atomicMin(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMax(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicMax(acc, value);
 #else
-  return RAJA::atomicMax(host_policy{}, acc, value);
+  return RAJA::atomicMax(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
   return detail::cuda_atomicInc(acc, value);
 #else
-  return RAJA::atomicInc(host_policy{}, acc, value);
+  return RAJA::atomicInc(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(cuda_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(cuda_atomic_explicit<host_policy>,
+                                         T* acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicInc(acc);
 #else
-  return RAJA::atomicInc(host_policy{}, acc);
+  return RAJA::atomicInc(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
   return detail::cuda_atomicDec(acc, value);
 #else
-  return RAJA::atomicDec(host_policy{}, acc, value);
+  return RAJA::atomicDec(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(cuda_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(cuda_atomic_explicit<host_policy>,
+                                         T* acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicDec(acc);
 #else
-  return RAJA::atomicDec(host_policy{}, acc);
+  return RAJA::atomicDec(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAnd(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicAnd(acc, value);
 #else
-  return RAJA::atomicAnd(host_policy{}, acc, value);
+  return RAJA::atomicAnd(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicOr(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(cuda_atomic_explicit<host_policy>,
+                                        T* acc,
+                                        T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicOr(acc, value);
 #else
-  return RAJA::atomicOr(host_policy{}, acc, value);
+  return RAJA::atomicOr(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicXor(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicXor(acc, value);
 #else
-  return RAJA::atomicXor(host_policy{}, acc, value);
+  return RAJA::atomicXor(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicExchange(cuda_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(cuda_atomic_explicit<host_policy>,
+                                              T* acc,
+                                              T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicExchange(acc, value);
 #else
-  return RAJA::atomicExchange(host_policy{}, acc, value);
+  return RAJA::atomicExchange(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
 RAJA_INLINE RAJA_HOST_DEVICE T
-atomicCAS(cuda_atomic_explicit<host_policy>, T *acc, T compare, T value)
+atomicCAS(cuda_atomic_explicit<host_policy>, T* acc, T compare, T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicCAS(acc, compare, value);
 #else
-  return RAJA::atomicCAS(host_policy{}, acc, compare, value);
+  return RAJA::atomicCAS(host_policy {}, acc, compare, value);
 #endif
 }
 
diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp
index 493136400c..490163c1a7 100644
--- a/include/RAJA/policy/cuda/forall.hpp
+++ b/include/RAJA/policy/cuda/forall.hpp
@@ -70,61 +70,91 @@ namespace impl
  *
  ******************************************************************************
  */
-template<typename IterationMapping, typename IterationGetter, typename Concretizer, typename UniqueMarker>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          typename UniqueMarker>
 struct ForallDimensionCalculator;
 
 // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0
 // there are specializations for named_usage::unspecified
 // but named_usage::ignored is not supported so no specializations are provided
 // and static_asserts in the general case catch unsupported values
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
 
   using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
 
-    if ( len > (block_size * grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (block_size * grid_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
-    internal::set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexGetter::block_size));
-    internal::set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexGetter::grid_size));
+    internal::set_cuda_dim<dim>(dims.threads,
+                                static_cast<IdxT>(IndexGetter::block_size));
+    internal::set_cuda_dim<dim>(dims.blocks,
+                                static_cast<IdxT>(IndexGetter::grid_size));
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size);
 
-    if ( block_size == IdxT(0) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (block_size == IdxT(0))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
@@ -132,43 +162,59 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size);
+    const IdxT grid_size  = concretizer.get_grid_size_to_fit_len(block_size);
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
     internal::set_cuda_dim<dim>(dims.blocks, grid_size);
   }
 };
 
-template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim, typename Concretizer, typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  using IndexGetter = ::RAJA::cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_len();
 
@@ -177,46 +223,67 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
 
   using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT RAJA_UNUSED_ARG(len),
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT RAJA_UNUSED_ARG(len),
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
     internal::set_cuda_dim<dim>(dims.blocks, grid_size);
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size);
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
@@ -224,43 +291,59 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size);
+    const IdxT grid_size  = concretizer.get_grid_size_to_fit_device(block_size);
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
     internal::set_cuda_dim<dim>(dims.blocks, grid_size);
   }
 };
 
-template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim, typename Concretizer, typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  using IndexGetter = ::RAJA::cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_device();
 
@@ -290,21 +373,22 @@ template <typename EXEC_POL,
           typename LOOP_BODY,
           typename IndexType,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+          typename IterationGetter  = typename EXEC_POL::IterationGetter,
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size > 0),
+                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_cuda_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if (ii < length) {
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
     body(idx[ii]);
   }
 }
@@ -315,21 +399,20 @@ template <typename EXEC_POL,
           typename LOOP_BODY,
           typename IndexType,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+          typename IterationGetter  = typename EXEC_POL::IterationGetter,
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size <= 0),
+                           size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void
+forall_cuda_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if (ii < length) {
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
     body(idx[ii]);
   }
 }
@@ -341,23 +424,24 @@ template <typename EXEC_POL,
           typename IndexType,
           typename ForallParam,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+          typename IterationGetter  = typename EXEC_POL::IterationGetter,
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size > 0),
+                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_cuda_kernel(LOOP_BODY loop_body,
+                             const Iterator idx,
+                             IndexType length,
+                             ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -369,138 +453,143 @@ template <typename EXEC_POL,
           typename IndexType,
           typename ForallParam,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+          typename IterationGetter  = typename EXEC_POL::IterationGetter,
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size <= 0),
+                           size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_cuda_kernel(LOOP_BODY loop_body,
+                                    const Iterator idx,
+                                    IndexType length,
+                                    ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template <
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping          = typename EXEC_POL::IterationMapping,
+    typename IterationGetter           = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_cuda_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
     body(idx[ii]);
   }
 }
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+template <
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter  = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void
+forall_cuda_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
     body(idx[ii]);
   }
 }
 
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template <
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping          = typename EXEC_POL::IterationMapping,
+    typename IterationGetter           = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_cuda_kernel(LOOP_BODY loop_body,
+                             const Iterator idx,
+                             IndexType length,
+                             ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+template <
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter  = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_cuda_kernel(LOOP_BODY loop_body,
+                                    const Iterator idx,
+                                    IndexType length,
+                                    ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -515,37 +604,50 @@ void forallp_cuda_kernel(LOOP_BODY loop_body,
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
+template <typename Iterable,
+          typename LoopBody,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BlocksPerSM,
+          bool Async,
           typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Cuda>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Cuda cuda_res,
-            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>const&,
+            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                     IterationGetter,
+                                                     Concretizer,
+                                                     BlocksPerSM,
+                                                     Async> const&,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<
+      IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
+  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter,
+                                    LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator =
+      impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
+                                      Concretizer, UniqueMarker>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0) {
+  if (len > 0)
+  {
 
     auto func = reinterpret_cast<const void*>(
         &impl::forall_cuda_kernel<EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
@@ -568,14 +670,16 @@ forall_impl(resources::Cuda cuda_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, cuda_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body = RAJA::cuda::make_launch_body(
+          func, dims.blocks, dims.threads, shmem, cuda_res,
+          std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len};
-      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len};
+      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res,
+                         Async);
     }
 
     RAJA_FT_END;
@@ -585,41 +689,56 @@ forall_impl(resources::Cuda cuda_res,
 }
 
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
+template <typename Iterable,
+          typename LoopBody,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BlocksPerSM,
+          bool Async,
           typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Cuda>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Cuda cuda_res,
-            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async> const&,
+            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                     IterationGetter,
+                                                     Concretizer,
+                                                     BlocksPerSM,
+                                                     Async> const&,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam f_params)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, camp::num<BlocksPerSM>, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<
+      IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
+  using UniqueMarker =
+      ::camp::list<IterationMapping, IterationGetter, camp::num<BlocksPerSM>,
+                   LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator =
+      impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
+                                      Concretizer, UniqueMarker>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0) {
+  if (len > 0)
+  {
 
     auto func = reinterpret_cast<const void*>(
-        &impl::forallp_cuda_kernel< EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
-                                   IndexType, camp::decay<ForallParam> >);
+        &impl::forallp_cuda_kernel<EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
+                                   IndexType, camp::decay<ForallParam>>);
 
     //
     // Setup shared memory buffers
@@ -635,9 +754,9 @@ forall_impl(resources::Cuda cuda_res,
     RAJA_FT_BEGIN;
 
     RAJA::cuda::detail::cudaInfo launch_info;
-    launch_info.gridDim = dims.blocks;
+    launch_info.gridDim  = dims.blocks;
     launch_info.blockDim = dims.threads;
-    launch_info.res = cuda_res;
+    launch_info.res      = cuda_res;
 
     {
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params, launch_info);
@@ -645,14 +764,17 @@ forall_impl(resources::Cuda cuda_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, cuda_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body = RAJA::cuda::make_launch_body(
+          func, dims.blocks, dims.threads, shmem, cuda_res,
+          std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params};
-      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len,
+                      (void*)&f_params};
+      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res,
+                         Async);
 
       RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params, launch_info);
     }
@@ -683,22 +805,32 @@ forall_impl(resources::Cuda cuda_res,
  ******************************************************************************
  */
 template <typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BlocksPerSM,
+          bool Async,
           typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Cuda>
-forall_impl(resources::Cuda r,
-            ExecPolicy<seq_segit, ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>>,
-            const TypedIndexSet<SegmentTypes...>& iset,
-            LoopBody&& loop_body)
+RAJA_INLINE resources::EventProxy<resources::Cuda> forall_impl(
+    resources::Cuda r,
+    ExecPolicy<seq_segit,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BlocksPerSM,
+                                                        Async>>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
-  for (int isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(r,
-                     isi,
-                     detail::CallForall(),
-                     ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, true>(),
-                     loop_body);
+  for (int isi = 0; isi < num_seg; ++isi)
+  {
+    iset.segmentCall(
+        r, isi, detail::CallForall(),
+        ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                 IterationGetter, Concretizer,
+                                                 BlocksPerSM, true>(),
+        loop_body);
   }  // iterate over segments of index set
 
   if (!Async) RAJA::cuda::synchronize(r);
diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp
index b2daa3a23e..a8daec62eb 100644
--- a/include/RAJA/policy/cuda/intrinsics.hpp
+++ b/include/RAJA/policy/cuda/intrinsics.hpp
@@ -59,15 +59,9 @@ namespace impl
  */
 struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
 {
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
 /*!
@@ -96,46 +90,45 @@ struct AccessorDeviceScopeUseBlockFence
   static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int);
   static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long);
 
-  template < typename T >
+  template <typename T>
   static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
+                                                   max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
-    auto ptr = const_cast<integer_type*>(reinterpret_cast<const integer_type*>(in_ptr + idx));
+    auto ptr = const_cast<integer_type*>(
+        reinterpret_cast<const integer_type*>(in_ptr + idx));
 
-    for (size_t i = 0; i < u.array_size(); ++i) {
+    for (size_t i = 0; i < u.array_size(); ++i)
+    {
       u.array[i] = atomicAdd(&ptr[i], integer_type(0));
     }
 
     return u.get_value();
   }
 
-  template < typename T >
+  template <typename T>
   static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
+                                                   max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
     u.set_value(val);
     auto ptr = reinterpret_cast<integer_type*>(in_ptr + idx);
 
-    for (size_t i = 0; i < u.array_size(); ++i) {
+    for (size_t i = 0; i < u.array_size(); ++i)
+    {
       atomicExch(&ptr[i], u.array[i]);
     }
   }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
 
@@ -160,10 +153,13 @@ constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
 template <typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
+                               max_shfl_int_type_size>
+      u;
   u.set_value(var);
 
-  for (size_t i = 0; i < u.array_size(); ++i) {
+  for (size_t i = 0; i < u.array_size(); ++i)
+  {
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
     u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask);
 #else
@@ -176,10 +172,13 @@ RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 template <typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
+                               max_shfl_int_type_size>
+      u;
   u.set_value(var);
 
-  for (size_t i = 0; i < u.array_size(); ++i) {
+  for (size_t i = 0; i < u.array_size(); ++i)
+  {
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
     u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane);
 #else
@@ -198,7 +197,8 @@ RAJA_DEVICE RAJA_INLINE int shfl_xor_sync<int>(int var, int laneMask)
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync<unsigned int>(unsigned int var, int laneMask)
+RAJA_DEVICE RAJA_INLINE unsigned int
+shfl_xor_sync<unsigned int>(unsigned int var, int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
@@ -210,19 +210,22 @@ RAJA_DEVICE RAJA_INLINE long shfl_xor_sync<long>(long var, int laneMask)
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync<unsigned long>(unsigned long var, int laneMask)
+RAJA_DEVICE RAJA_INLINE unsigned long
+shfl_xor_sync<unsigned long>(unsigned long var, int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var, int laneMask)
+RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var,
+                                                           int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(unsigned long long var, int laneMask)
+RAJA_DEVICE RAJA_INLINE unsigned long long
+shfl_xor_sync<unsigned long long>(unsigned long long var, int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
@@ -265,7 +268,8 @@ RAJA_DEVICE RAJA_INLINE int shfl_sync<int>(int var, int srcLane)
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync<unsigned int>(unsigned int var, int srcLane)
+RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync<unsigned int>(unsigned int var,
+                                                             int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
@@ -277,19 +281,22 @@ RAJA_DEVICE RAJA_INLINE long shfl_sync<long>(long var, int srcLane)
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync<unsigned long>(unsigned long var, int srcLane)
+RAJA_DEVICE RAJA_INLINE unsigned long
+shfl_sync<unsigned long>(unsigned long var, int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var, int srcLane)
+RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var,
+                                                       int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
 template <>
-RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(unsigned long long var, int srcLane)
+RAJA_DEVICE RAJA_INLINE unsigned long long
+shfl_sync<unsigned long long>(unsigned long long var, int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
@@ -334,23 +341,28 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
 
   T temp = val;
 
-  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0) {
+  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = shfl_xor_sync(temp, i);
-      Combiner{}(temp, rhs);
+      Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = shfl_sync(temp, srcLane);
+      T rhs       = shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        Combiner {}(temp, rhs);
       }
     }
   }
@@ -370,9 +382,10 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
 {
   T temp = val;
 
-  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+  {
     T rhs = __shfl_xor_sync(0xffffffff, temp, i);
-    Combiner{}(temp, rhs);
+    Combiner {}(temp, rhs);
   }
 
   return temp;
@@ -388,65 +401,81 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
 
-  int warpId = threadId % policy::cuda::device_constants.WARP_SIZE;
+  int warpId  = threadId % policy::cuda::device_constants.WARP_SIZE;
   int warpNum = threadId / policy::cuda::device_constants.WARP_SIZE;
 
   T temp = val;
 
-  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0) {
+  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = shfl_xor_sync(temp, i);
-      Combiner{}(temp, rhs);
+      Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = shfl_sync(temp, srcLane);
+      T rhs       = shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        Combiner {}(temp, rhs);
       }
     }
   }
 
   // reduce per warp values
-  if (numThreads > policy::cuda::device_constants.WARP_SIZE) {
+  if (numThreads > policy::cuda::device_constants.WARP_SIZE)
+  {
 
-    static_assert(policy::cuda::device_constants.MAX_WARPS <= policy::cuda::device_constants.WARP_SIZE,
-        "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values");
+    static_assert(policy::cuda::device_constants.MAX_WARPS <=
+                      policy::cuda::device_constants.WARP_SIZE,
+                  "This algorithms assumes a warp of WARP_SIZE threads can "
+                  "reduce MAX_WARPS values");
 
     // Need to separate declaration and initialization for clang-cuda
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
     RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>* sd =
-      reinterpret_cast<RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS> *>(tmpsd);
+        reinterpret_cast<RAJA::detail::SoAArray<
+            T, policy::cuda::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
-    if (warpId == 0) {
+    if (warpId == 0)
+    {
       sd->set(warpNum, temp);
     }
 
     __syncthreads();
 
-    if (warpNum == 0) {
+    if (warpNum == 0)
+    {
 
       // read per warp values
-      if (warpId * policy::cuda::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * policy::cuda::device_constants.WARP_SIZE < numThreads)
+      {
         temp = sd->get(warpId);
-      } else {
+      }
+      else
+      {
         temp = identity;
       }
 
-      for (int i = 1; i < policy::cuda::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < policy::cuda::device_constants.MAX_WARPS; i *= 2)
+      {
         T rhs = shfl_xor_sync(temp, i);
-        Combiner{}(temp, rhs);
+        Combiner {}(temp, rhs);
       }
     }
 
diff --git a/include/RAJA/policy/cuda/kernel/Conditional.hpp b/include/RAJA/policy/cuda/kernel/Conditional.hpp
index ff15848bcb..1f0b999adc 100644
--- a/include/RAJA/policy/cuda/kernel/Conditional.hpp
+++ b/include/RAJA/policy/cuda/kernel/Conditional.hpp
@@ -42,18 +42,17 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<Data,
                              statement::If<Conditional, EnclosedStmts...>,
-                             Types> {
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
-    if (Conditional::eval(data)) {
+    if (Conditional::eval(data))
+    {
 
       // execute enclosed statements
       enclosed_stmts_t::exec(data, thread_active);
@@ -61,10 +60,7 @@ struct CudaStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
index 7465f515b0..75a7dddccb 100644
--- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
+++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
@@ -51,7 +51,8 @@ namespace RAJA
  * Blocks per SM must be chosen by the user.
  */
 template <bool async0, int num_blocks, int num_threads, int blocks_per_sm>
-struct cuda_explicit_launch {};
+struct cuda_explicit_launch
+{};
 
 /*!
  * CUDA kernel launch policy where the user specifies the number of physical
@@ -67,7 +68,10 @@ struct cuda_explicit_launch {};
  * Blocks per SM defaults to 1.
  */
 template <bool async0, int num_blocks, int num_threads>
-using cuda_launch = cuda_explicit_launch<async0, num_blocks, num_threads, policy::cuda::MIN_BLOCKS_PER_SM>;
+using cuda_launch = cuda_explicit_launch<async0,
+                                         num_blocks,
+                                         num_threads,
+                                         policy::cuda::MIN_BLOCKS_PER_SM>;
 
 /*!
  * CUDA kernel launch policy where the number of physical blocks and threads
@@ -75,7 +79,11 @@ using cuda_launch = cuda_explicit_launch<async0, num_blocks, num_threads, policy
  * If num_threads is 0 then num_threads is chosen at runtime.
  */
 template <int num_threads0, bool async0>
-using cuda_occ_calc_launch = cuda_explicit_launch<async0, 0, num_threads0, policy::cuda::MIN_BLOCKS_PER_SM>;
+using cuda_occ_calc_launch =
+    cuda_explicit_launch<async0,
+                         0,
+                         num_threads0,
+                         policy::cuda::MIN_BLOCKS_PER_SM>;
 
 namespace statement
 {
@@ -87,8 +95,11 @@ namespace statement
  */
 template <typename LaunchConfig, typename... EnclosedStmts>
 struct CudaKernelExt
-    : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit<LaunchConfig, void, void, 0, true>, EnclosedStmts...> {
-};
+    : public internal::Statement<
+          ::RAJA::policy::cuda::
+              cuda_exec_explicit<LaunchConfig, void, void, 0, true>,
+          EnclosedStmts...>
+{};
 
 
 /*!
@@ -98,8 +109,8 @@ struct CudaKernelExt
  * The kernel launch is synchronous.
  */
 template <int num_blocks, int num_threads, typename... EnclosedStmts>
-using CudaKernelExp =
-    CudaKernelExt<cuda_launch<false, num_blocks, num_threads>, EnclosedStmts...>;
+using CudaKernelExp = CudaKernelExt<cuda_launch<false, num_blocks, num_threads>,
+                                    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with the flexibility
@@ -135,9 +146,9 @@ using CudaKernelOccAsync =
  * The kernel launch is synchronous.
  */
 template <int num_threads, typename... EnclosedStmts>
-using CudaKernelFixed =
-    CudaKernelExt<cuda_launch<false, operators::limits<int>::max(), num_threads>,
-                  EnclosedStmts...>;
+using CudaKernelFixed = CudaKernelExt<
+    cuda_launch<false, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with a fixed
@@ -156,7 +167,10 @@ using CudaKernelFixedAsync =
  */
 template <int num_threads, int blocks_per_sm, typename... EnclosedStmts>
 using CudaKernelFixedSM =
-    CudaKernelExt<cuda_explicit_launch<false, operators::limits<int>::max(), num_threads, blocks_per_sm>,
+    CudaKernelExt<cuda_explicit_launch<false,
+                                       operators::limits<int>::max(),
+                                       num_threads,
+                                       blocks_per_sm>,
                   EnclosedStmts...>;
 
 /*!
@@ -166,7 +180,10 @@ using CudaKernelFixedSM =
  */
 template <int num_threads, int blocks_per_sm, typename... EnclosedStmts>
 using CudaKernelFixedSMAsync =
-    CudaKernelExt<cuda_explicit_launch<true, operators::limits<int>::max(), num_threads, blocks_per_sm>,
+    CudaKernelExt<cuda_explicit_launch<true,
+                                       operators::limits<int>::max(),
+                                       num_threads,
+                                       blocks_per_sm>,
                   EnclosedStmts...>;
 
 /*!
@@ -196,7 +213,7 @@ template <typename Data, typename Exec>
 __global__ void CudaKernelLauncher(Data data)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   Exec::exec(private_data, true);
@@ -215,7 +232,7 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__
     void CudaKernelLauncherFixed(Data data)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   // execute the the object
@@ -231,13 +248,18 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__
  * The default case handles BlockSize != 0 and gets the fixed max block size
  * version of the kernel.
  */
-template<int BlockSize, int BlocksPerSM, typename Data, typename executor_t>
+template <int BlockSize, int BlocksPerSM, typename Data, typename executor_t>
 struct CudaKernelLauncherGetter
 {
-  using type = camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data, executor_t>)>;
+  using type =
+      camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize,
+                                                              BlocksPerSM,
+                                                              Data,
+                                                              executor_t>)>;
   static constexpr type get() noexcept
   {
-    return &internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data, executor_t>;
+    return &internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data,
+                                              executor_t>;
   }
 };
 
@@ -245,10 +267,11 @@ struct CudaKernelLauncherGetter
  * Helper class specialization for BlockSize == 0 and gets the unfixed max
  * block size version of the kernel.
  */
-template<typename Data, typename executor_t>
+template <typename Data, typename executor_t>
 struct CudaKernelLauncherGetter<0, 0, Data, executor_t>
 {
-  using type = camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;
+  using type =
+      camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;
   static constexpr type get() noexcept
   {
     return &internal::CudaKernelLauncher<Data, executor_t>;
@@ -256,12 +279,14 @@ struct CudaKernelLauncherGetter<0, 0, Data, executor_t>
 };
 
 
-
 /*!
  * Helper class that handles CUDA kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template <typename LaunchPolicy,
+          typename StmtList,
+          typename Data,
+          typename Types>
 struct CudaLaunchHelper;
 
 
@@ -270,16 +295,31 @@ struct CudaLaunchHelper;
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the CUDA occupancy calculator.
  */
-template<bool async0, int num_blocks, int num_threads, int blocks_per_sm, typename StmtList, typename Data, typename Types>
-struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,StmtList,Data,Types>
+template <bool async0,
+          int num_blocks,
+          int num_threads,
+          int blocks_per_sm,
+          typename StmtList,
+          typename Data,
+          typename Types>
+struct CudaLaunchHelper<
+    cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,
+    StmtList,
+    Data,
+    Types>
 {
   using Self = CudaLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::cuda_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::cuda_statement_list_executor_t<StmtList, Data, Types>;
 
-  using kernelGetter_t = CudaKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, (blocks_per_sm <= 0) ? 0 : blocks_per_sm, Data, executor_t>;
+  using kernelGetter_t =
+      CudaKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
+                               (blocks_per_sm <= 0) ? 0 : blocks_per_sm,
+                               Data,
+                               executor_t>;
 
   inline static const void* get_func()
   {
@@ -287,13 +327,16 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
   }
 
   inline static void recommended_blocks_threads(size_t shmem_size,
-      int &recommended_blocks, int &recommended_threads)
+                                                int& recommended_blocks,
+                                                int& recommended_threads)
   {
     auto func = Self::get_func();
 
-    if (num_blocks <= 0) {
+    if (num_blocks <= 0)
+    {
 
-      if (num_threads <= 0) {
+      if (num_threads <= 0)
+      {
 
         //
         // determine blocks at runtime
@@ -301,10 +344,11 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         //
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads<Self>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_device;
+        recommended_blocks  = data.func_max_blocks_per_device;
         recommended_threads = data.func_max_threads_per_block;
-
-      } else {
+      }
+      else
+      {
 
         //
         // determine blocks at runtime
@@ -314,69 +358,73 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
 
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
+        recommended_blocks =
+            data.func_max_blocks_per_sm * data.device_sm_per_device;
       }
+    }
+    else
+    {
 
-    } else {
-
-      if (num_threads <= 0) {
+      if (num_threads <= 0)
+      {
 
         //
         // determine threads at runtime, unsure what use 1024
         // this value may be invalid for kernels with high register pressure
         //
         recommended_threads = 1024;
-
-      } else {
+      }
+      else
+      {
 
         //
         // threads determined at compile-time
         //
         recommended_threads = num_threads;
-
       }
 
       //
       // blocks determined at compile-time
       //
       recommended_blocks = num_blocks;
-
     }
   }
 
-  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size), int &max_threads)
+  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
+                                 int& max_threads)
   {
-    if (num_threads <= 0) {
+    if (num_threads <= 0)
+    {
 
       //
       // determine threads at runtime, unsure what use 1024
       // this value may be invalid for kernels with high register pressure
       //
       max_threads = 1024;
-
-    } else {
+    }
+    else
+    {
 
       //
       // threads determined at compile-time
       //
       max_threads = num_threads;
-
     }
   }
 
-  inline static void max_blocks(size_t shmem_size,
-      int &max_blocks, int actual_threads)
+  inline static void
+  max_blocks(size_t shmem_size, int& max_blocks, int actual_threads)
   {
     auto func = Self::get_func();
 
-    if (num_blocks <= 0) {
+    if (num_blocks <= 0)
+    {
 
       //
       // determine blocks at runtime
       //
-      if (num_threads <= 0 ||
-          num_threads != actual_threads) {
+      if (num_threads <= 0 || num_threads != actual_threads)
+      {
 
         //
         // determine blocks when actual_threads != num_threads
@@ -384,8 +432,9 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self>(
             func, shmem_size, actual_threads);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
-      } else {
+      }
+      else
+      {
 
         //
         // determine blocks when actual_threads == num_threads
@@ -393,16 +442,15 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
       }
-
-    } else {
+    }
+    else
+    {
 
       //
       // blocks determined at compile-time
       //
       max_blocks = num_blocks;
-
     }
   }
 };
@@ -416,8 +464,10 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
  * The algorithm is greedy (and probably could be improved), and favors
  * maximizing the number of threads (or blocks) in x, y, then z.
  */
-inline
-cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t minimum = cuda_dim_t()){
+inline cuda_dim_t fitCudaDims(cuda_dim_member_t limit,
+                              cuda_dim_t result,
+                              cuda_dim_t minimum = cuda_dim_t())
+{
 
 
   // clamp things to at least 1
@@ -430,12 +480,13 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
   minimum.z = minimum.z ? minimum.z : 1;
 
   // if we are under the limit, we're done
-  if(result.x * result.y * result.z <= limit) return result;
+  if (result.x * result.y * result.z <= limit) return result;
 
   // Can we reduce z to fit?
-  if(result.x * result.y * minimum.z < limit){
+  if (result.x * result.y * minimum.z < limit)
+  {
     // compute a new z
-    result.z = limit / (result.x*result.y);
+    result.z = limit / (result.x * result.y);
     return result;
   }
   // we don't fit, so reduce z to it's minimum and continue on to y
@@ -443,9 +494,10 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
 
 
   // Can we reduce y to fit?
-  if(result.x * minimum.y * result.z < limit){
+  if (result.x * minimum.y * result.z < limit)
+  {
     // compute a new y
-    result.y = limit / (result.x*result.z);
+    result.y = limit / (result.x * result.z);
     return result;
   }
   // we don't fit, so reduce y to it's minimum and continue on to x
@@ -453,9 +505,10 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
 
 
   // Can we reduce y to fit?
-  if(minimum.x * result.y * result.z < limit){
+  if (minimum.x * result.y * result.z < limit)
+  {
     // compute a new x
-    result.x = limit / (result.y*result.z);
+    result.x = limit / (result.y * result.z);
     return result;
   }
   // we don't fit, so we'll return the smallest possible thing
@@ -470,18 +523,21 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
  */
 template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using StatementType =
       statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>;
 
   template <typename Data>
-  static inline void exec(Data &&data)
+  static inline void exec(Data&& data)
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = CudaLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
 
 
@@ -495,9 +551,10 @@ struct StatementExecutor<
 
 
     // Only launch kernel if we have something to iterate over
-    int num_blocks = launch_dims.num_blocks();
+    int num_blocks  = launch_dims.num_blocks();
     int num_threads = launch_dims.num_threads();
-    if (num_blocks > 0 || num_threads > 0) {
+    if (num_blocks > 0 || num_threads > 0)
+    {
 
       //
       // Setup shared memory buffers
@@ -510,8 +567,8 @@ struct StatementExecutor<
       //
       int recommended_blocks;
       int recommended_threads;
-      launch_t::recommended_blocks_threads(
-          shmem, recommended_blocks, recommended_threads);
+      launch_t::recommended_blocks_threads(shmem, recommended_blocks,
+                                           recommended_threads);
 
 
       //
@@ -524,24 +581,24 @@ struct StatementExecutor<
       //
       // Fit the requested threads
       //
-      cuda_dim_t fit_threads{0,0,0};
+      cuda_dim_t fit_threads {0, 0, 0};
 
-      if ( recommended_threads >= get_size(launch_dims.min_dims.threads) ) {
-
-        fit_threads = fitCudaDims(
-            recommended_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads >= get_size(launch_dims.min_dims.threads))
+      {
 
+        fit_threads = fitCudaDims(recommended_threads, launch_dims.dims.threads,
+                                  launch_dims.min_dims.threads);
       }
 
       //
       // Redo fit with max threads
       //
-      if ( recommended_threads < max_threads &&
-           get_size(fit_threads) != recommended_threads ) {
-
-        fit_threads = fitCudaDims(
-            max_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads < max_threads &&
+          get_size(fit_threads) != recommended_threads)
+      {
 
+        fit_threads = fitCudaDims(max_threads, launch_dims.dims.threads,
+                                  launch_dims.min_dims.threads);
       }
 
       launch_dims.dims.threads = fit_threads;
@@ -555,24 +612,25 @@ struct StatementExecutor<
 
       int use_blocks;
 
-      if ( launch_dims.num_threads() == recommended_threads ) {
+      if (launch_dims.num_threads() == recommended_threads)
+      {
 
         //
         // Fit the requested blocks
         //
         use_blocks = recommended_blocks;
-
-      } else {
+      }
+      else
+      {
 
         //
         // Fit the max blocks
         //
         use_blocks = max_blocks;
-
       }
 
-      launch_dims.dims.blocks = fitCudaDims(
-          use_blocks, launch_dims.dims.blocks, launch_dims.min_dims.blocks);
+      launch_dims.dims.blocks = fitCudaDims(use_blocks, launch_dims.dims.blocks,
+                                            launch_dims.min_dims.blocks);
 
       //
       // make sure that we fit
@@ -581,7 +639,8 @@ struct StatementExecutor<
       if(launch_dims.num_blocks() > max_blocks){
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");
       }*/
-      if(launch_dims.num_threads() > max_threads){
+      if (launch_dims.num_threads() > max_threads)
+      {
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");
       }
 
@@ -595,14 +654,17 @@ struct StatementExecutor<
         // of the launch_dims and potential changes to shmem here that is
         // currently an unresolved issue.
         //
-        auto cuda_data = RAJA::cuda::make_launch_body(func,
-            launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res, data);
+        auto cuda_data = RAJA::cuda::make_launch_body(
+            func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
+            data);
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&cuda_data};
-        RAJA::cuda::launch(func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, launch_t::async);
+        void* args[] = {(void*)&cuda_data};
+        RAJA::cuda::launch(func, launch_dims.dims.blocks,
+                           launch_dims.dims.threads, args, shmem, res,
+                           launch_t::async);
       }
     }
   }
diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp
index 58ffa1ba14..3176fd5bf8 100644
--- a/include/RAJA/policy/cuda/kernel/For.hpp
+++ b/include/RAJA/policy/cuda/kernel/For.hpp
@@ -45,9 +45,12 @@ template <typename Data,
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                    sync,
+                                                    IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -60,13 +63,13 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+      RAJA::policy::cuda::
+          cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -79,14 +82,13 @@ struct CudaStatementExecutor<
     enclosed_stmts_t::exec(data, thread_active && have_work);
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
     CudaDims my_dims(0), my_min_dims(0);
     DimensionCalculator::set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -108,9 +110,13 @@ template <typename Data,
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::sync,
+                       IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -123,20 +129,23 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+      RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -151,14 +160,13 @@ struct CudaStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -180,9 +188,13 @@ template <typename Data,
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::none,
+                       IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -195,20 +207,23 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+      RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through one at a time
     // threads will have different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the index to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -218,14 +233,13 @@ struct CudaStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -245,14 +259,19 @@ struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
     Types>
-: CudaStatementExecutor<Data, statement::For<ArgumentId,
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
-};
+    : CudaStatementExecutor<
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 
 /*
@@ -263,33 +282,32 @@ struct CudaStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_warp_masked_direct<Mask>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -299,13 +317,11 @@ struct CudaStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -320,7 +336,7 @@ struct CudaStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -332,41 +348,41 @@ struct CudaStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_warp_masked_loop<Mask>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -382,9 +398,7 @@ struct CudaStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -399,7 +413,7 @@ struct CudaStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -411,30 +425,29 @@ struct CudaStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+    Data,
+    statement::For<ArgumentId,
+                   RAJA::cuda_thread_masked_direct<Mask>,
+                   EnclosedStmts...>,
+    Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -444,13 +457,11 @@ struct CudaStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -466,7 +477,7 @@ struct CudaStatementExecutor<
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
@@ -478,39 +489,38 @@ struct CudaStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_thread_masked_loop<Mask>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -526,9 +536,7 @@ struct CudaStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -544,7 +552,7 @@ struct CudaStatementExecutor<
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp
index 87556ed8b1..18e11fb989 100644
--- a/include/RAJA/policy/cuda/kernel/ForICount.hpp
+++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp
@@ -46,33 +46,40 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::
+                  cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
       statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                      sync,
+                                                      IndexMapper>,
                      EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -102,38 +109,52 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -165,38 +186,52 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the index to the argument and param
       data.template assign_offset<ArgumentId>(i);
@@ -225,14 +260,19 @@ struct CudaStatementExecutor<
     Data,
     statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
     Types>
-: CudaStatementExecutor<Data, statement::ForICount<ArgumentId,
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
-};
+    : CudaStatementExecutor<
+          Data,
+          statement::ForICount<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 
 /*
@@ -244,40 +284,47 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_warp_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_warp_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -288,9 +335,8 @@ struct CudaStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
 
@@ -303,48 +349,56 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_warp_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_warp_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -359,7 +413,6 @@ struct CudaStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 
@@ -372,37 +425,43 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_thread_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::cuda_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
-          Data,
-          statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+      Data,
+      statement::For<ArgumentId,
+                     RAJA::cuda_thread_masked_direct<Mask>,
+                     EnclosedStmts...>,
+      Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -413,9 +472,8 @@ struct CudaStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
 
@@ -428,45 +486,52 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_thread_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_thread_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -481,7 +546,6 @@ struct CudaStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 }  // namespace internal
diff --git a/include/RAJA/policy/cuda/kernel/Hyperplane.hpp b/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
index fd33192a65..74c02b8608 100644
--- a/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
+++ b/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
@@ -41,33 +41,31 @@ template <typename Data,
           camp::idx_t... Args,
           typename... EnclosedStmts,
           typename Types>
-struct CudaStatementExecutor<Data,
-                             statement::Hyperplane<HpArgumentId,
-                                                   seq_exec,
-                                                   ArgList<Args...>,
-                                                   EnclosedStmts...>,
-                             Types> {
+struct CudaStatementExecutor<
+    Data,
+    statement::
+        Hyperplane<HpArgumentId, seq_exec, ArgList<Args...>, EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, HpArgumentId, Data>;
 
-  using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+  using enclosed_stmts_t =
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // compute Manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    int hp_len = segment_length<HpArgumentId>(data) +
-                 foldl(RAJA::operators::plus<int>(),
-                               segment_length<Args>(data)...);
+    int hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<int>(), segment_length<Args>(data)...);
 
     int h_args = foldl(RAJA::operators::plus<idx_t>(),
-        camp::get<Args>(data.offset_tuple)...);
+                       camp::get<Args>(data.offset_tuple)...);
 
     // get length of i dimension
     auto i_len = segment_length<HpArgumentId>(data);
@@ -79,7 +77,8 @@ struct CudaStatementExecutor<Data,
      * We reject the iterations that lie outside of the specified rectangular
      * region we are sweeping.
      */
-    for (int h = 0; h < hp_len; ++h) {
+    for (int h = 0; h < hp_len; ++h)
+    {
 
       // compute actual iterate for HpArgumentId
       // as:  i0 = h - (i1 + i2 + i3 + ...)
@@ -93,18 +92,13 @@ struct CudaStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
 };
 
 
-
-
 }  // end namespace internal
 
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp b/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
index 258cd204d6..018d9d0dfd 100644
--- a/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
+++ b/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
@@ -39,27 +39,30 @@ struct cuda_shared_mem;
 namespace internal
 {
 
-//Intialize thread shared array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
+// Initialize thread shared array
+template <typename Data,
+          camp::idx_t... Indices,
+          typename... EnclosedStmts,
+          typename Types>
 struct CudaStatementExecutor<Data,
                              statement::InitLocalMem<RAJA::cuda_shared_mem,
-                             camp::idx_seq<Indices...>, EnclosedStmts...>,
+                                                     camp::idx_seq<Indices...>,
+                                                     EnclosedStmts...>,
                              Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  //Launch loops
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Launch loops
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -67,40 +70,33 @@ struct CudaStatementExecutor<Data,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null base case
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
 
-  //Set pointer to null recursive case
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null recursive case
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
@@ -108,47 +104,47 @@ struct CudaStatementExecutor<Data,
   }
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
 
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
-//Intialize thread private array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct CudaStatementExecutor<Data, statement::InitLocalMem<RAJA::cuda_thread_mem, camp::idx_seq<Indices...>, EnclosedStmts...>, Types>
+// Initialize thread private array
+template <typename Data,
+          camp::idx_t... Indices,
+          typename... EnclosedStmts,
+          typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::InitLocalMem<RAJA::cuda_thread_mem,
+                                                     camp::idx_seq<Indices...>,
+                                                     EnclosedStmts...>,
+                             Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  //Launch loops
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Launch loops
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -156,40 +152,33 @@ struct CudaStatementExecutor<Data, statement::InitLocalMem<RAJA::cuda_thread_mem
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null base case
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
 
-  //Set pointer to null recursive case
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null recursive case
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
@@ -197,31 +186,24 @@ struct CudaStatementExecutor<Data, statement::InitLocalMem<RAJA::cuda_thread_mem
   }
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
 
+    // Initialize scoped arrays + launch loops
+    // Intialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
 
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Lambda.hpp b/include/RAJA/policy/cuda/kernel/Lambda.hpp
index e932a3e270..37287561fd 100644
--- a/include/RAJA/policy/cuda/kernel/Lambda.hpp
+++ b/include/RAJA/policy/cuda/kernel/Lambda.hpp
@@ -40,30 +40,34 @@ namespace RAJA
 namespace internal
 {
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct CudaStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
+template <typename Data,
+          camp::idx_t LambdaIndex,
+          typename... Args,
+          typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::Lambda<LambdaIndex, Args...>,
+                             Types>
+{
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active)
+    {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims
+  calculateDimensions(Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
 
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Reduce.hpp b/include/RAJA/policy/cuda/kernel/Reduce.hpp
index 7e46748991..dfa80667a0 100644
--- a/include/RAJA/policy/cuda/kernel/Reduce.hpp
+++ b/include/RAJA/policy/cuda/kernel/Reduce.hpp
@@ -35,7 +35,8 @@ namespace internal
 // Executor that handles reductions across a single CUDA thread block
 //
 template <typename Data,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts,
           typename Types>
@@ -44,22 +45,24 @@ struct CudaStatementExecutor<Data,
                                                ReduceOperator,
                                                ParamId,
                                                EnclosedStmts...>,
-                             Types> {
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // block reduce on the specified parameter
-    auto value = data.template get_param<ParamId>();
+    auto value    = data.template get_param<ParamId>();
     using value_t = decltype(value);
     value_t ident = value_t();
 
     // if this thread isn't active, just set it to the identity
-    if (!thread_active) {
+    if (!thread_active)
+    {
       value = ident;
     }
 
@@ -73,7 +76,8 @@ struct CudaStatementExecutor<Data,
 
     // execute enclosed statements, and mask off everyone but thread 0
     thread_active = threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
-    if(thread_active){
+    if (thread_active)
+    {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
@@ -81,7 +85,7 @@ struct CudaStatementExecutor<Data,
   }
 
 
-  static inline LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -94,7 +98,8 @@ struct CudaStatementExecutor<Data,
 // Executor that handles reductions across a single CUDA thread warp
 //
 template <typename Data,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts,
           typename Types>
@@ -103,35 +108,37 @@ struct CudaStatementExecutor<Data,
                                                ReduceOperator,
                                                ParamId,
                                                EnclosedStmts...>,
-                             Types> {
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // block reduce on the specified parameter
-    auto value = data.template get_param<ParamId>();
+    auto value    = data.template get_param<ParamId>();
     using value_t = decltype(value);
     value_t ident = value_t();
 
     // if this thread isn't active, just set it to the identity
-    if (!thread_active) {
+    if (!thread_active)
+    {
       value = ident;
     }
 
     // Call warp reduction routine
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::cuda::impl::warp_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::cuda::impl::warp_reduce<combiner_t>(value, ident);
     data.template assign_param<ParamId>(new_value);
 
     // execute enclosed statements, and mask off everyone but lane 0
     thread_active = threadIdx.x == 0;
-    if(thread_active){
+    if (thread_active)
+    {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
@@ -139,7 +146,7 @@ struct CudaStatementExecutor<Data,
   }
 
 
-  static inline LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -148,7 +155,6 @@ struct CudaStatementExecutor<Data,
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Sync.hpp b/include/RAJA/policy/cuda/kernel/Sync.hpp
index 7dd45d8837..e750c6bfc0 100644
--- a/include/RAJA/policy/cuda/kernel/Sync.hpp
+++ b/include/RAJA/policy/cuda/kernel/Sync.hpp
@@ -43,14 +43,14 @@ namespace statement
 /*!
  * A RAJA::kernel statement that performs a CUDA __syncthreads().
  */
-struct CudaSyncThreads : public internal::Statement<camp::nil> {
-};
+struct CudaSyncThreads : public internal::Statement<camp::nil>
+{};
 
 /*!
  * A RAJA::kernel statement that performs a CUDA __syncwarp().
  */
-struct CudaSyncWarp : public internal::Statement<camp::nil> {
-};
+struct CudaSyncWarp : public internal::Statement<camp::nil>
+{};
 
 }  // namespace statement
 
@@ -58,37 +58,39 @@ namespace internal
 {
 
 template <typename Data, typename Types>
-struct CudaStatementExecutor<Data, statement::CudaSyncThreads, Types> {
+struct CudaStatementExecutor<Data, statement::CudaSyncThreads, Types>
+{
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &, bool) { __syncthreads(); }
+  static inline RAJA_DEVICE void exec(Data&, bool) { __syncthreads(); }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims
+  calculateDimensions(Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
 template <typename Data, typename Types>
-struct CudaStatementExecutor<Data, statement::CudaSyncWarp, Types> {
+struct CudaStatementExecutor<Data, statement::CudaSyncWarp, Types>
+{
 
-  static
-  inline
-  RAJA_DEVICE
+  static inline RAJA_DEVICE
 #if CUDART_VERSION >= 9000
-  void exec(Data &, bool) { __syncwarp(); }
+      void
+      exec(Data&, bool)
+  {
+    __syncwarp();
+  }
 #else
-  void exec(Data &, bool) {  }
+      void
+      exec(Data&, bool)
+  {
+  }
 #endif
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims
+  calculateDimensions(Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp
index ad901f6b02..a7f36c54b7 100644
--- a/include/RAJA/policy/cuda/kernel/Tile.hpp
+++ b/include/RAJA/policy/cuda/kernel/Tile.hpp
@@ -58,10 +58,12 @@ struct CudaStatementExecutor<
     Data,
     statement::Tile<ArgumentId,
                     RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                     sync,
+                                                     IndexMapper>,
                     EnclosedStmts...>,
-                    Types>
-  {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -69,19 +71,21 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+  using DimensionCalculator = KernelDimensionCalculator<
+      RAJA::policy::cuda::
+          cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -100,23 +104,23 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, static_cast<diff_t>(chunk_size));
@@ -141,11 +145,16 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -153,26 +162,32 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -190,23 +205,23 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -231,11 +246,16 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -243,26 +263,32 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -275,23 +301,23 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -316,15 +342,22 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
-: CudaStatementExecutor<Data, statement::Tile<ArgumentId, TPol,
-    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
-};
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : CudaStatementExecutor<
+          Data,
+          statement::Tile<
+              ArgumentId,
+              TPol,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 }  // end namespace internal
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp
index c611346d46..377ac4edff 100644
--- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp
@@ -58,42 +58,49 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::
+                  cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::
+              cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t t = IndexMapper::template index<diff_t>();
-    const diff_t i = t * static_cast<diff_t>(chunk_size);
+    const diff_t t   = IndexMapper::template index<diff_t>();
+    const diff_t i   = t * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -129,50 +136,64 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(chunk_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(chunk_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -207,50 +228,64 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(chunk_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(chunk_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for(diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -279,15 +314,24 @@ template <typename Data,
           typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
-: CudaStatementExecutor<Data, statement::TileTCount<ArgumentId, ParamId, TPol,
-    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
-};
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : CudaStatementExecutor<
+          Data,
+          statement::TileTCount<
+              ArgumentId,
+              ParamId,
+              TPol,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 }  // end namespace internal
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp
index 9c904ea45a..e8124cc64a 100644
--- a/include/RAJA/policy/cuda/kernel/internal.hpp
+++ b/include/RAJA/policy/cuda/kernel/internal.hpp
@@ -44,29 +44,26 @@ namespace RAJA
 namespace internal
 {
 
-struct LaunchDims {
+struct LaunchDims
+{
 
   CudaDims dims;
   CudaDims min_dims;
 
-  LaunchDims() = default;
-  LaunchDims(LaunchDims const&) = default;
+  LaunchDims()                             = default;
+  LaunchDims(LaunchDims const&)            = default;
   LaunchDims& operator=(LaunchDims const&) = default;
 
   RAJA_INLINE
-  LaunchDims(CudaDims _dims)
-    : dims{_dims}
-    , min_dims{}
-  { }
+  LaunchDims(CudaDims _dims) : dims {_dims}, min_dims {} {}
 
   RAJA_INLINE
   LaunchDims(CudaDims _dims, CudaDims _min_dims)
-    : dims{_dims}
-    , min_dims{_min_dims}
-  { }
+      : dims {_dims}, min_dims {_min_dims}
+  {}
 
   RAJA_INLINE
-  LaunchDims max(LaunchDims const &c) const
+  LaunchDims max(LaunchDims const& c) const
   {
     LaunchDims result;
 
@@ -82,43 +79,44 @@ struct LaunchDims {
     result.dims.threads.y = std::max(c.dims.threads.y, dims.threads.y);
     result.dims.threads.z = std::max(c.dims.threads.z, dims.threads.z);
 
-    result.min_dims.threads.x = std::max(c.min_dims.threads.x, min_dims.threads.x);
-    result.min_dims.threads.y = std::max(c.min_dims.threads.y, min_dims.threads.y);
-    result.min_dims.threads.z = std::max(c.min_dims.threads.z, min_dims.threads.z);
+    result.min_dims.threads.x =
+        std::max(c.min_dims.threads.x, min_dims.threads.x);
+    result.min_dims.threads.y =
+        std::max(c.min_dims.threads.y, min_dims.threads.y);
+    result.min_dims.threads.z =
+        std::max(c.min_dims.threads.z, min_dims.threads.z);
 
     return result;
   }
 
   RAJA_INLINE
-  int num_blocks() const {
-    return dims.num_blocks();
-  }
+  int num_blocks() const { return dims.num_blocks(); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return dims.num_threads();
-  }
+  int num_threads() const { return dims.num_threads(); }
 
 
   RAJA_INLINE
-  void clamp_to_min_blocks() {
+  void clamp_to_min_blocks()
+  {
     dims.blocks.x = std::max(min_dims.blocks.x, dims.blocks.x);
     dims.blocks.y = std::max(min_dims.blocks.y, dims.blocks.y);
     dims.blocks.z = std::max(min_dims.blocks.z, dims.blocks.z);
   };
 
   RAJA_INLINE
-  void clamp_to_min_threads() {
+  void clamp_to_min_threads()
+  {
     dims.threads.x = std::max(min_dims.threads.x, dims.threads.x);
     dims.threads.y = std::max(min_dims.threads.y, dims.threads.y);
     dims.threads.z = std::max(min_dims.threads.z, dims.threads.z);
   };
-
 };
 
 
 template <camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
-struct CudaStatementListExecutorHelper {
+struct CudaStatementListExecutorHelper
+{
 
   using next_helper_t =
       CudaStatementListExecutorHelper<cur_stmt + 1, num_stmts, StmtList>;
@@ -126,7 +124,7 @@ struct CudaStatementListExecutorHelper {
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, bool thread_active)
+  inline static RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, thread_active);
@@ -137,7 +135,7 @@ struct CudaStatementListExecutorHelper {
 
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &data)
+  inline static LaunchDims calculateDimensions(Data& data)
   {
     // Compute this statements launch dimensions
     LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
@@ -151,16 +149,17 @@ struct CudaStatementListExecutorHelper {
 };
 
 template <camp::idx_t num_stmts, typename StmtList>
-struct CudaStatementListExecutorHelper<num_stmts, num_stmts, StmtList> {
+struct CudaStatementListExecutorHelper<num_stmts, num_stmts, StmtList>
+{
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &, bool)
+  inline static RAJA_DEVICE void exec(Data&, bool)
   {
     // nop terminator
   }
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &)
+  inline static LaunchDims calculateDimensions(Data&)
   {
     return LaunchDims();
   }
@@ -175,109 +174,120 @@ struct CudaStatementListExecutor;
 
 
 template <typename Data, typename... Stmts, typename Types>
-struct CudaStatementListExecutor<Data, StatementList<Stmts...>, Types> {
+struct CudaStatementListExecutor<Data, StatementList<Stmts...>, Types>
+{
 
   using enclosed_stmts_t =
       camp::list<CudaStatementExecutor<Data, Stmts, Types>...>;
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute statements in order with helper class
-    CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, thread_active);
+    CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, thread_active);
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute this statements launch dimensions
-    return CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
-        calculateDimensions(data);
+    return CudaStatementListExecutorHelper<
+        0, num_stmts, enclosed_stmts_t>::calculateDimensions(data);
   }
 };
 
 
 template <typename StmtList, typename Data, typename Types>
-using cuda_statement_list_executor_t = CudaStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+using cuda_statement_list_executor_t =
+    CudaStatementListExecutor<Data, StmtList, Types>;
 
 
 // specialization for direct sequential policies
-template<typename kernel_indexer>
+template <typename kernel_indexer>
 struct KernelDimensionCalculator;
 
 // specialization for direct sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
-    if ( len > static_cast<IdxT>(1) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(1))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
   }
 };
 
 // specialization for direct thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(len));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::block_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::block_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for direct block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -285,164 +295,219 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::grid_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for direct global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+  using IndexMapper = cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
-    if (len > static_cast<IdxT>(0)) {
+    if (len > static_cast<IdxT>(0))
+    {
       RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
     }
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_cuda_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_cuda_dim<dim>(dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > (static_cast<IdxT>(IndexMapper::block_size) *
-                static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (static_cast<IdxT>(IndexMapper::block_size) *
+               static_cast<IdxT>(IndexMapper::grid_size)))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 
 // specialization for strided loop sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
-
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT RAJA_UNUSED_ARG(len))
-  {
-  }
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+
+  template <typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT RAJA_UNUSED_ARG(len))
+  {}
 };
 
 // specialization for strided loop thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void
+  set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for strided loop block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -450,35 +515,43 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void
+  set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for strided loop global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+  using IndexMapper = cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if (len > static_cast<IdxT>(0)) {
+    if (len > static_cast<IdxT>(0))
+    {
       set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(1));
       set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(1));
       set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
@@ -487,62 +560,88 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_cuda_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_cuda_dim<dim>(dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(1));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void
+  set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 75e5f6902b..355b080a30 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -35,9 +35,9 @@ __global__ void launch_global_fcn(BODY body_in)
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
@@ -45,38 +45,47 @@ __global__ void launch_global_fcn(BODY body_in)
 }
 
 template <typename BODY, typename ReduceParams>
-__global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params)
+__global__ void launch_new_reduce_global_fcn(BODY body_in,
+                                             ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
 template <bool async>
-struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usage::unspecified, named_usage::unspecified>> {
+struct LaunchExecute<
+    RAJA::policy::cuda::cuda_launch_explicit_t<async,
+                                               named_usage::unspecified,
+                                               named_usage::unspecified>>
+{
 
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_global_fcn<BODY>);
+    auto func = reinterpret_cast<const void*>(&launch_global_fcn<BODY>);
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
 
@@ -84,18 +93,20 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
+    cuda_dim_t gridSize {static_cast<cuda_dim_member_t>(params.teams.value[0]),
                          static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+                         static_cast<cuda_dim_member_t>(params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+    cuda_dim_t blockSize {
+        static_cast<cuda_dim_member_t>(params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
@@ -105,14 +116,16 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::cuda::make_launch_body(
+            func, gridSize, blockSize, shared_mem_size, cuda_res,
+            std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void* args[] = {(void*)&body};
+        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
+                           cuda_res, async, kernel_name);
       }
 
       RAJA_FT_END;
@@ -121,13 +134,18 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  //Version with explicit reduction parameters..
+  // Version with explicit reduction parameters..
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
@@ -140,46 +158,54 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[2]) };
+    cuda_dim_t gridSize {
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[2]) };
+    cuda_dim_t blockSize {
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
       RAJA::cuda::detail::cudaInfo launch_info;
-      launch_info.gridDim = gridSize;
-      launch_info.blockDim = blockSize;
+      launch_info.gridDim      = gridSize;
+      launch_info.blockDim     = blockSize;
       launch_info.dynamic_smem = &shared_mem_size;
-      launch_info.res = cuda_res;
+      launch_info.res          = cuda_res;
 
       {
-        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usage::unspecified, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<
+            async, named_usage::unspecified, named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::cuda::make_launch_body(
+            func, gridSize, blockSize, shared_mem_size, cuda_res,
+            std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void* args[] = {(void*)&body, (void*)&launch_reducers};
+        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
+                           cuda_res, async, kernel_name);
 
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -187,56 +213,66 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
 template <typename BODY, int num_threads, size_t BLOCKS_PER_SM>
 __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
-void launch_global_fcn_fixed(BODY body_in)
+    void launch_global_fcn_fixed(BODY body_in)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
   body(ctx);
 }
 
-template <typename BODY, int num_threads, size_t BLOCKS_PER_SM, typename ReduceParams>
+template <typename BODY,
+          int num_threads,
+          size_t BLOCKS_PER_SM,
+          typename ReduceParams>
 __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
-void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params)
+    void launch_new_reduce_global_fcn_fixed(BODY body_in,
+                                            ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
 template <bool async, int nthreads, size_t BLOCKS_PER_SM>
-struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>> {
+struct LaunchExecute<
+    RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>
+{
 
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
 
     using BODY = camp::decay<BODY_IN>;
@@ -250,18 +286,20 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
+    cuda_dim_t gridSize {static_cast<cuda_dim_member_t>(params.teams.value[0]),
                          static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+                         static_cast<cuda_dim_member_t>(params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+    cuda_dim_t blockSize {
+        static_cast<cuda_dim_member_t>(params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
@@ -271,14 +309,16 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::cuda::make_launch_body(
+            func, gridSize, blockSize, shared_mem_size, cuda_res,
+            std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void* args[] = {(void*)&body};
+        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
+                           cuda_res, async, kernel_name);
       }
 
       RAJA_FT_END;
@@ -287,19 +327,25 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  //Version with explicit reduction parameters..
-  template<typename BODY_IN, typename ReduceParams>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN && body_in, ReduceParams &launch_reducers)
+  // Version with explicit reduction parameters..
+  template <typename BODY_IN, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& launch_reducers)
   {
 
     using BODY = camp::decay<BODY_IN>;
 
     auto func = reinterpret_cast<const void*>(
-        &launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM, camp::decay<ReduceParams>>);
+        &launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM,
+                                            camp::decay<ReduceParams>>);
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
 
@@ -307,53 +353,61 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[2]) };
+    cuda_dim_t gridSize {
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[2]) };
+    cuda_dim_t blockSize {
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[2])};
 
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
       RAJA::cuda::detail::cudaInfo launch_info;
-      launch_info.gridDim = gridSize;
-      launch_info.blockDim = blockSize;
+      launch_info.gridDim      = gridSize;
+      launch_info.blockDim     = blockSize;
       launch_info.dynamic_smem = &shared_mem_size;
-      launch_info.res = cuda_res;
+      launch_info.res          = cuda_res;
       {
 
-        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL =
+            RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
+                                                       BLOCKS_PER_SM>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::cuda::make_launch_body(
+            func, gridSize, blockSize, shared_mem_size, cuda_res,
+            std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void* args[] = {(void*)&body, (void*)&launch_reducers};
+        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
+                           cuda_res, async, kernel_name);
 
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
     }
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
@@ -361,43 +415,50 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
    CUDA generic loop implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -405,29 +466,36 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1) {
+    if (i0 < len0 && i1 < len1)
+    {
       body(*(segment0.begin() + i0), *(segment1.begin() + i1));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -437,53 +505,62 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
     const diff_t i1 = IndexMapper1::template index<diff_t>();
     const diff_t i2 = IndexMapper2::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1 && i2 < len2) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
+    if (i0 < len0 && i1 < len1 && i2 < len2)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1),
            *(segment2.begin() + i2));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -494,34 +571,42 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1));
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1));
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -535,14 +620,16 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
     const diff_t i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) {
+        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
+        {
 
-          body(*(segment0.begin() + i0),
-               *(segment1.begin() + i1),
+          body(*(segment0.begin() + i0), *(segment1.begin() + i1),
                *(segment2.begin() + i2));
         }
       }
@@ -551,42 +638,49 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 };
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -594,31 +688,36 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           i0, i1);
+    if (i0 < len0 && i1 < len1)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -628,54 +727,62 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i1 = IndexMapper1::template index<diff_t>();
     const diff_t i2 = IndexMapper2::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1 && i2 < len2) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           *(segment2.begin() + i2),
-           i0, i1, i2);
+    if (i0 < len0 && i1 < len1 && i2 < len2)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1),
+           *(segment2.begin() + i2), i0, i1, i2);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -686,35 +793,42 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1),
-             i0, i1);
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -728,16 +842,17 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
     const diff_t i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) {
+        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
+        {
 
-          body(*(segment0.begin() + i0),
-               *(segment1.begin() + i1),
-               *(segment2.begin() + i2),
-               i0, i1, i2);
+          body(*(segment0.begin() + i0), *(segment1.begin() + i1),
+               *(segment2.begin() + i2), i0, i1, i2);
         }
       }
     }
@@ -748,31 +863,34 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
 /*
    CUDA generic flattened loop implementations
 */
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             sync,
+                                             IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::cuda::
+              cuda_indexer<RAJA::iteration_mapping::Direct, sync, IndexMapper0>,
+          SEGMENT>
 {};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             kernel_sync_requirement::none,
+                                             IndexMapper0,
+                                             IndexMapper1>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -781,29 +899,35 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
 
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
 
-    const int i = i0 + i0_stride*i1;
+    const int i = i0 + i0_stride * i1;
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             kernel_sync_requirement::none,
+                                             IndexMapper0,
+                                             IndexMapper1,
+                                             IndexMapper2>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -814,39 +938,47 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    const int i = i0 + i0_stride*(i1 + i1_stride*i2);
+    const int i = i0 + i0_stride * (i1 + i1_stride * i2);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        sync,
+        IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::cuda::cuda_indexer<
+              RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+              sync,
+              IndexMapper0>,
+          SEGMENT>
 {};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -856,29 +988,34 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const int i0_stride = IndexMapper0::template size<diff_t>();
     const int i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*i1;
-         i < len;
-         i += i0_stride*i1_stride) {
+    for (int i = i0 + i0_stride * i1; i < len; i += i0_stride * i1_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -890,9 +1027,9 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const int i1_stride = IndexMapper1::template size<diff_t>();
     const int i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*(i1 + i1_stride*i2);
-         i < len;
-         i += i0_stride*i1_stride*i2_stride) {
+    for (int i = i0 + i0_stride * (i1 + i1_stride * i2); i < len;
+         i += i0_stride * i1_stride * i2_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
@@ -903,101 +1040,122 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
    CUDA generic tile implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t t = IndexMapper::template index<diff_t>();
-    const diff_t i = t * static_cast<diff_t>(tile_size);
+    const diff_t t   = IndexMapper::template index<diff_t>();
+    const diff_t i   = t * static_cast<diff_t>(tile_size);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(tile_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(tile_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(tile_size);
 
-    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
     }
   }
diff --git a/include/RAJA/policy/cuda/multi_reduce.hpp b/include/RAJA/policy/cuda/multi_reduce.hpp
index f9f60f730e..e52a036c8f 100644
--- a/include/RAJA/policy/cuda/multi_reduce.hpp
+++ b/include/RAJA/policy/cuda/multi_reduce.hpp
@@ -46,9 +46,9 @@
 #include "RAJA/policy/cuda/intrinsics.hpp"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
 #include "RAJA/policy/cuda/policy.hpp"
@@ -73,100 +73,124 @@ namespace impl
 //
 
 //! combine value into global memory
-template <typename Combiner, typename GetTallyIndex,
-          typename T, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins),
-                                                                      T identity,
-                                                                      int bin,
-                                                                      T value,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+template <typename Combiner,
+          typename GetTallyIndex,
+          typename T,
+          typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void
+block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins),
+                                         T identity,
+                                         int bin,
+                                         T value,
+                                         T* tally_mem,
+                                         GetTallyOffset get_tally_offset,
+                                         int tally_replication,
+                                         int tally_bins)
 {
-  if (value == identity) { return; }
+  if (value == identity)
+  {
+    return;
+  }
 
-  int tally_index = GetTallyIndex::template index<int>(); // globalWarpId by default
+  int tally_index =
+      GetTallyIndex::template index<int>();  // globalWarpId by default
   int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication);
-  int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
-  RAJA::reduce::cuda::atomic<Combiner>{}(tally_mem[tally_offset], value);
+  int tally_offset =
+      get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+  RAJA::reduce::cuda::atomic<Combiner> {}(tally_mem[tally_offset], value);
 }
 
 
 //! initialize shared memory
 template <typename T>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins,
-                                                           T identity,
-                                                           T* shared_mem,
-                                                           int shared_replication)
+RAJA_DEVICE RAJA_INLINE void
+block_multi_reduce_init_shmem(int num_bins,
+                              T identity,
+                              T* shared_mem,
+                              int shared_replication)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   for (int shmem_offset = threadId;
-       shmem_offset < shared_replication * num_bins;
-       shmem_offset += numThreads) {
+       shmem_offset < shared_replication * num_bins; shmem_offset += numThreads)
+  {
     shared_mem[shmem_offset] = identity;
   }
   __syncthreads();
 }
 
 //! combine value into shared memory
-template <typename Combiner, typename GetSharedIndex,
-          typename T, typename GetSharedOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins,
-                                                                     T identity,
-                                                                     int bin,
-                                                                     T value,
-                                                                     T* shared_mem,
-                                                                     GetSharedOffset get_shared_offset,
-                                                                     int shared_replication)
+template <typename Combiner,
+          typename GetSharedIndex,
+          typename T,
+          typename GetSharedOffset>
+RAJA_DEVICE RAJA_INLINE void
+block_multi_reduce_combine_shmem_atomic(int num_bins,
+                                        T identity,
+                                        int bin,
+                                        T value,
+                                        T* shared_mem,
+                                        GetSharedOffset get_shared_offset,
+                                        int shared_replication)
 {
-  if (value == identity) { return; }
+  if (value == identity)
+  {
+    return;
+  }
 
-  int shared_index = GetSharedIndex::template index<int>(); // threadId by default
+  int shared_index =
+      GetSharedIndex::template index<int>();  // threadId by default
   int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication);
-  int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+  int shmem_offset =
+      get_shared_offset(bin, num_bins, shared_rep, shared_replication);
 
-  RAJA::reduce::cuda::atomic<Combiner>{}(shared_mem[shmem_offset], value);
+  RAJA::reduce::cuda::atomic<Combiner> {}(shared_mem[shmem_offset], value);
 }
 
 //! combine value into shared memory
 template <typename Combiner,
-          typename T, typename GetSharedOffset, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins,
-                                                                      T identity,
-                                                                      T* shared_mem,
-                                                                      GetSharedOffset get_shared_offset,
-                                                                      int shared_replication,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+          typename T,
+          typename GetSharedOffset,
+          typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void
+grid_multi_reduce_shmem_to_global_atomic(int num_bins,
+                                         T identity,
+                                         T* shared_mem,
+                                         GetSharedOffset get_shared_offset,
+                                         int shared_replication,
+                                         T* tally_mem,
+                                         GetTallyOffset get_tally_offset,
+                                         int tally_replication,
+                                         int tally_bins)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
-                 (gridDim.x * gridDim.y) * blockIdx.z;
+                (gridDim.x * gridDim.y) * blockIdx.z;
 
   __syncthreads();
-  for (int bin = threadId; bin < num_bins; bin += numThreads) {
+  for (int bin = threadId; bin < num_bins; bin += numThreads)
+  {
 
     T value = identity;
-    for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) {
-      int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
-      Combiner{}(value, shared_mem[shmem_offset]);
+    for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep)
+    {
+      int shmem_offset =
+          get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+      Combiner {}(value, shared_mem[shmem_offset]);
     }
 
-    if (value != identity) {
+    if (value != identity)
+    {
       int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication);
-      int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
-      RAJA::reduce::cuda::atomic<Combiner>{}(tally_mem[tally_offset], value);
+      int tally_offset =
+          get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+      RAJA::reduce::cuda::atomic<Combiner> {}(tally_mem[tally_offset], value);
     }
-
   }
 }
 
@@ -185,48 +209,63 @@ template <typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_TallyData
 {
   //! setup permanent settings, allocate and initialize tally memory
-  template < typename Container >
-  MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity)
-      : m_tally_mem(nullptr)
-      , m_identity(identity)
-      , m_num_bins(container.size())
-      , m_tally_bins(get_tally_bins(m_num_bins))
-      , m_tally_replication(get_tally_replication())
+  template <typename Container>
+  MultiReduceGridAtomicHostInit_TallyData(Container const& container,
+                                          T const& identity)
+      : m_tally_mem(nullptr),
+        m_identity(identity),
+        m_num_bins(container.size()),
+        m_tally_bins(get_tally_bins(m_num_bins)),
+        m_tally_replication(get_tally_replication())
   {
-    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
+    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
+                               m_tally_replication);
   }
 
   MultiReduceGridAtomicHostInit_TallyData() = delete;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  ~MultiReduceGridAtomicHostInit_TallyData() = default;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  MultiReduceGridAtomicHostInit_TallyData&
+  operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData&
+  operator=(MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  ~MultiReduceGridAtomicHostInit_TallyData()           = default;
 
 
   //! reset permanent settings, reallocate and reset tally memory
-  template < typename Container >
+  template <typename Container>
   void reset_permanent(Container const& container, T const& identity)
   {
     int new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
+    if (new_num_bins != m_num_bins)
+    {
       teardown_permanent();
-      m_num_bins = new_num_bins;
-      m_tally_bins = get_tally_bins(m_num_bins);
+      m_num_bins          = new_num_bins;
+      m_tally_bins        = get_tally_bins(m_num_bins);
       m_tally_replication = get_tally_replication();
-      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
-    } else {
+      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
+                                 m_tally_replication);
+    }
+    else
+    {
       {
         int tally_rep = 0;
-        int bin = 0;
-        for (auto const& value : container) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value;
+        int bin       = 0;
+        for (auto const& value : container)
+        {
+          m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
+                                        m_tally_replication)] = value;
           ++bin;
         }
       }
-      for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) {
-        for (int bin = 0; bin < m_num_bins; ++bin) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity;
+      for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep)
+      {
+        for (int bin = 0; bin < m_num_bins; ++bin)
+        {
+          m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
+                                        m_tally_replication)] = identity;
         }
       }
     }
@@ -244,9 +283,11 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T get(int bin) const
   {
     ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
-          reducer(m_identity);
-    for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) {
-      int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+        reducer(m_identity);
+    for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep)
+    {
+      int tally_offset =
+          GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
       reducer.combine(m_tally_mem[tally_offset]);
     }
     return reducer.get_and_clear();
@@ -258,20 +299,27 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T identity() const { return m_identity; }
 
 private:
-  static constexpr size_t s_tally_alignment = std::max(size_t(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
-                                                       size_t(RAJA::DATA_ALIGN));
-  static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
+  static constexpr size_t s_tally_alignment = std::max(
+      size_t(
+          policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
+      size_t(RAJA::DATA_ALIGN));
+  static constexpr size_t s_tally_bunch_size =
+      RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
 
   using tally_mempool_type = device_pinned_mempool_type;
-  using tally_tuning = typename tuning::GlobalAtomicReplicationTuning;
-  using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer;
+  using tally_tuning       = typename tuning::GlobalAtomicReplicationTuning;
+  using TallyAtomicReplicationConcretizer =
+      typename tally_tuning::AtomicReplicationConcretizer;
   using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator;
-  using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch<s_tally_bunch_size>;
+  using GetTallyOffset_rebind =
+      typename GetTallyOffset_rebind_rebunch::template rebunch<
+          s_tally_bunch_size>;
 
 
   static int get_tally_bins(int num_bins)
   {
-    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size;
+    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) *
+           s_tally_bunch_size;
   }
 
   static int get_tally_replication()
@@ -281,39 +329,50 @@ struct MultiReduceGridAtomicHostInit_TallyData
     min_tally_replication = omp_get_max_threads();
 #endif
 
-    struct {
+    struct
+    {
       int func_min_global_replication;
-    } func_data{min_tally_replication};
+    } func_data {min_tally_replication};
 
-    return TallyAtomicReplicationConcretizer{}.template
-        get_global_replication<int>(func_data);
+    return TallyAtomicReplicationConcretizer {}
+        .template get_global_replication<int>(func_data);
   }
 
-  template < typename Container >
-  static T* create_tally(Container const& container, T const& identity,
-                         int num_bins, int tally_bins, int tally_replication)
+  template <typename Container>
+  static T* create_tally(Container const& container,
+                         T const& identity,
+                         int num_bins,
+                         int tally_bins,
+                         int tally_replication)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
 
     T* tally_mem = tally_mempool_type::getInstance().template malloc<T>(
-        tally_replication*tally_bins, s_tally_alignment);
+        tally_replication * tally_bins, s_tally_alignment);
 
-    if (tally_replication > 0) {
+    if (tally_replication > 0)
+    {
       {
         int tally_rep = 0;
-        int bin = 0;
-        for (auto const& value : container) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(value);
+        int bin       = 0;
+        for (auto const& value : container)
+        {
+          int tally_offset =
+              GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(value);
           ++bin;
         }
       }
-      for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) {
-        for (int bin = 0; bin < num_bins; ++bin) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(identity);
+      for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep)
+      {
+        for (int bin = 0; bin < num_bins; ++bin)
+        {
+          int tally_offset =
+              GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(identity);
         }
       }
     }
@@ -321,15 +380,21 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
   static void destroy_tally(T*& tally_mem,
-                            int num_bins, int tally_bins, int tally_replication)
+                            int num_bins,
+                            int tally_bins,
+                            int tally_replication)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
 
-    for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) {
-      for (int bin = num_bins; bin > 0; --bin) {
-        int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication);
+    for (int tally_rep = tally_replication + 1; tally_rep > 0; --tally_rep)
+    {
+      for (int bin = num_bins; bin > 0; --bin)
+      {
+        int tally_offset = GetTallyOffset {}(bin - 1, tally_bins, tally_rep - 1,
+                                             tally_replication);
         tally_mem[tally_offset].~T();
       }
     }
@@ -338,14 +403,15 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
 protected:
-  using GetTallyIndex = typename tally_tuning::ReplicationIndexer;
+  using GetTallyIndex  = typename tally_tuning::ReplicationIndexer;
   using GetTallyOffset = typename GetTallyOffset_rebind::template rebind<int>;
 
   T* m_tally_mem;
   T m_identity;
   int m_num_bins;
   int m_tally_bins;
-  int m_tally_replication; // power of 2, at least the max number of omp threads
+  int m_tally_replication;  // power of 2, at least the max number of omp
+                            // threads
 };
 
 
@@ -354,34 +420,31 @@ template <typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_Data
     : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
 {
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! defer to tally data for some functions
-  using TallyData::TallyData;
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::TallyData;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, do nothing
-  void setup_launch(size_t RAJA_UNUSED_ARG(block_size))
-  { }
+  void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) {}
 
   //! teardown per launch, do nothing
-  void teardown_launch()
-  { }
+  void teardown_launch() {}
 
 
   //! setup on device, do nothing
   RAJA_DEVICE
-  void setup_device()
-  { }
+  void setup_device() {}
 
   //! finalize on device, do nothing
   RAJA_DEVICE
-  void finalize_device()
-  { }
+  void finalize_device() {}
 
 
   //! combine value on device, combine a value into the tally atomically
@@ -389,9 +452,8 @@ struct MultiReduceGridAtomicHostInit_Data
   void combine_device(int bin, T value)
   {
     impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-        m_num_bins, m_identity,
-        bin, value,
-        m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+        m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
+        m_tally_replication, m_tally_bins);
   }
 
   //! combine value on host, combine a value into the tally
@@ -401,18 +463,19 @@ struct MultiReduceGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
-    Combiner{}(m_tally_mem[tally_offset], value);
+    int tally_offset =
+        GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    Combiner {}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using typename TallyData::GetTallyIndex;
   using typename TallyData::GetTallyOffset;
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 };
 
@@ -422,57 +485,69 @@ template <typename Combiner, typename T, typename tuning>
 struct MultiReduceBlockThenGridAtomicHostInit_Data
     : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
 {
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! setup permanent settings, defer to tally data
-  template < typename Container >
-  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity)
-      : TallyData(container, identity)
-      , m_shared_offset(s_shared_offset_unknown)
-      , m_shared_replication(0)
-  { }
+  template <typename Container>
+  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container,
+                                              T const& identity)
+      : TallyData(container, identity),
+        m_shared_offset(s_shared_offset_unknown),
+        m_shared_replication(0)
+  {}
 
   MultiReduceBlockThenGridAtomicHostInit_Data() = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  ~MultiReduceBlockThenGridAtomicHostInit_Data() = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  MultiReduceBlockThenGridAtomicHostInit_Data&
+  operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data&
+  operator=(MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  ~MultiReduceBlockThenGridAtomicHostInit_Data()           = default;
 
 
   //! defer to tally data for some functions
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, setup shared memory parameters
   void setup_launch(size_t block_size)
   {
-    if (m_num_bins == size_t(0)) {
+    if (m_num_bins == size_t(0))
+    {
       m_shared_offset = s_shared_offset_invalid;
       return;
     }
 
-    size_t shared_replication = 0;
+    size_t shared_replication  = 0;
     const size_t shared_offset = allocateDynamicShmem<T>(
-        [&](size_t max_shmem_size) {
-
-      struct {
-        size_t func_threads_per_block;
-        size_t func_max_shared_replication_per_block;
-      } func_data{block_size, max_shmem_size / m_num_bins};
-
-      shared_replication = SharedAtomicReplicationConcretizer{}.template
-          get_shared_replication<size_t>(func_data);
-      return m_num_bins * shared_replication;
-    });
-
-    if (shared_offset != dynamic_smem_allocation_failure) {
+        [&](size_t max_shmem_size)
+        {
+          struct
+          {
+            size_t func_threads_per_block;
+            size_t func_max_shared_replication_per_block;
+          } func_data {block_size, max_shmem_size / m_num_bins};
+
+          shared_replication =
+              SharedAtomicReplicationConcretizer {}
+                  .template get_shared_replication<size_t>(func_data);
+          return m_num_bins * shared_replication;
+        });
+
+    if (shared_offset != dynamic_smem_allocation_failure)
+    {
       m_shared_replication = static_cast<int>(shared_replication);
-      m_shared_offset = static_cast<int>(shared_offset);
-    } else {
+      m_shared_offset      = static_cast<int>(shared_offset);
+    }
+    else
+    {
       m_shared_offset = s_shared_offset_invalid;
     }
   }
@@ -481,7 +556,7 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void teardown_launch()
   {
     m_shared_replication = 0;
-    m_shared_offset = s_shared_offset_unknown;
+    m_shared_offset      = s_shared_offset_unknown;
   }
 
 
@@ -490,10 +565,10 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void setup_device()
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
-      impl::block_multi_reduce_init_shmem(
-          m_num_bins, m_identity,
-          shared_mem, m_shared_replication);
+    if (shared_mem != nullptr)
+    {
+      impl::block_multi_reduce_init_shmem(m_num_bins, m_identity, shared_mem,
+                                          m_shared_replication);
     }
   }
 
@@ -502,11 +577,12 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void finalize_device()
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
+    if (shared_mem != nullptr)
+    {
       impl::grid_multi_reduce_shmem_to_global_atomic<Combiner>(
-          m_num_bins, m_identity,
-          shared_mem, GetSharedOffset{}, m_shared_replication,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins, m_identity, shared_mem, GetSharedOffset {},
+          m_shared_replication, m_tally_mem, GetTallyOffset {},
+          m_tally_replication, m_tally_bins);
     }
   }
 
@@ -516,16 +592,17 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void combine_device(int bin, T value)
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
+    if (shared_mem != nullptr)
+    {
       impl::block_multi_reduce_combine_shmem_atomic<Combiner, GetSharedIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          shared_mem, GetSharedOffset{}, m_shared_replication);
-    } else {
+          m_num_bins, m_identity, bin, value, shared_mem, GetSharedOffset {},
+          m_shared_replication);
+    }
+    else
+    {
       impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
+          m_tally_replication, m_tally_bins);
     }
   }
 
@@ -536,14 +613,16 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
-    Combiner{}(m_tally_mem[tally_offset], value);
+    int tally_offset =
+        GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    Combiner {}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using shared_tuning = typename tuning::SharedAtomicReplicationTuning;
-  using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer;
-  using GetSharedIndex = typename shared_tuning::ReplicationIndexer;
+  using SharedAtomicReplicationConcretizer =
+      typename shared_tuning::AtomicReplicationConcretizer;
+  using GetSharedIndex         = typename shared_tuning::ReplicationIndexer;
   using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator;
   using GetSharedOffset = typename GetSharedOffset_rebind::template rebind<int>;
 
@@ -551,24 +630,27 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   using typename TallyData::GetTallyOffset;
 
 
-  static constexpr int s_shared_offset_unknown = std::numeric_limits<int>::max();
-  static constexpr int s_shared_offset_invalid = std::numeric_limits<int>::max() - 1;
+  static constexpr int s_shared_offset_unknown =
+      std::numeric_limits<int>::max();
+  static constexpr int s_shared_offset_invalid =
+      std::numeric_limits<int>::max() - 1;
 
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 
-  int m_shared_offset; // in bytes
-  int m_shared_replication; // power of 2
+  int m_shared_offset;       // in bytes
+  int m_shared_replication;  // power of 2
 
 
   RAJA_DEVICE
   T* get_shared_mem() const
   {
-    if (m_shared_offset == s_shared_offset_invalid) {
+    if (m_shared_offset == s_shared_offset_invalid)
+    {
       return nullptr;
     }
     extern __shared__ char shared_mem[];
@@ -595,39 +677,50 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template <typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataCuda
 {
-  static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::cuda::cuda_atomic_available<T>::value;
 
   //! cuda reduction data storage class and folding algorithm
-  using reduce_data_type =
-      std::conditional_t<(atomic_available),
-        std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic),
-          cuda::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-          std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic),
-            cuda::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-            void>>,
+  using reduce_data_type = std::conditional_t<
+      (atomic_available),
+      std::conditional_t<
+          (tuning::algorithm ==
+           multi_reduce_algorithm::
+               init_host_combine_block_atomic_then_grid_atomic),
+          cuda::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                            T,
+                                                            tuning>,
+          std::conditional_t<
+              (tuning::algorithm ==
+               multi_reduce_algorithm::init_host_combine_global_atomic),
+              cuda::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                       T,
+                                                       tuning>,
+              void>>,
       void>;
 
 
   using SyncList = std::vector<resources::Cuda>;
 
 public:
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataCuda() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataCuda>::value>* = nullptr >
+  template <
+      typename Container,
+      std::enable_if_t<!std::is_same<Container, MultiReduceDataCuda>::value>* =
+          nullptr>
   MultiReduceDataCuda(Container const& container, T identity)
-      : m_parent(this)
-      , m_sync_list(new SyncList)
-      , m_data(container, identity)
-      , m_own_launch_data(false)
-  {
-  }
+      : m_parent(this),
+        m_sync_list(new SyncList),
+        m_data(container, identity),
+        m_own_launch_data(false)
+  {}
 
   //! copy and on host attempt to setup for device
   //  init val_ptr to avoid uninitialized read caused by host copy of
@@ -639,31 +732,35 @@ struct MultiReduceDataCuda
 #else
       : m_parent(&other)
 #endif
-      , m_sync_list(other.m_sync_list)
-      , m_data(other.m_data)
-      , m_own_launch_data(false)
+        ,
+        m_sync_list(other.m_sync_list),
+        m_data(other.m_data),
+        m_own_launch_data(false)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_parent) {
-      if (setupReducers()) {
+    if (m_parent)
+    {
+      if (setupReducers())
+      {
         // the copy made in make_launch_body does this setup
         add_resource_to_synchronization_list(currentResource());
         m_data.setup_launch(currentBlockSize());
         m_own_launch_data = true;
-        m_parent = nullptr;
+        m_parent          = nullptr;
       }
     }
 #else
-    if (!m_parent->m_parent) {
+    if (!m_parent->m_parent)
+    {
       // the first copy on device enters this branch
       m_data.setup_device();
     }
 #endif
   }
 
-  MultiReduceDataCuda(MultiReduceDataCuda &&) = delete;
+  MultiReduceDataCuda(MultiReduceDataCuda&&)                 = delete;
   MultiReduceDataCuda& operator=(MultiReduceDataCuda const&) = delete;
-  MultiReduceDataCuda& operator=(MultiReduceDataCuda &&) = delete;
+  MultiReduceDataCuda& operator=(MultiReduceDataCuda&&)      = delete;
 
   //! cleanup resources owned by this copy
   //  on device store in pinned buffer on host
@@ -671,23 +768,30 @@ struct MultiReduceDataCuda
   ~MultiReduceDataCuda()
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_parent == this) {
+    if (m_parent == this)
+    {
       // the original object, owns permanent storage
       synchronize_resources_and_clear_list();
       delete m_sync_list;
       m_sync_list = nullptr;
       m_data.teardown_permanent();
-    } else if (m_parent) {
+    }
+    else if (m_parent)
+    {
       // do nothing
-    } else {
-      if (m_own_launch_data) {
+    }
+    else
+    {
+      if (m_own_launch_data)
+      {
         // the copy made in make_launch_body, owns launch data
         m_data.teardown_launch();
         m_own_launch_data = false;
       }
     }
 #else
-    if (!m_parent->m_parent) {
+    if (!m_parent->m_parent)
+    {
       // the first copy on device, does finalization on the device
       m_data.finalize_device();
     }
@@ -695,7 +799,7 @@ struct MultiReduceDataCuda
   }
 
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
     synchronize_resources_and_clear_list();
@@ -729,15 +833,17 @@ struct MultiReduceDataCuda
 
 
 private:
-  MultiReduceDataCuda const *m_parent;
+  MultiReduceDataCuda const* m_parent;
   SyncList* m_sync_list;
   reduce_data_type m_data;
   bool m_own_launch_data;
 
   void add_resource_to_synchronization_list(resources::Cuda res)
   {
-    for (resources::Cuda& list_res : *m_sync_list) {
-      if (list_res.get_stream() == res.get_stream()) {
+    for (resources::Cuda& list_res : *m_sync_list)
+    {
+      if (list_res.get_stream() == res.get_stream())
+      {
         return;
       }
     }
@@ -746,7 +852,8 @@ struct MultiReduceDataCuda
 
   void synchronize_resources_and_clear_list()
   {
-    for (resources::Cuda& list_res : *m_sync_list) {
+    for (resources::Cuda& list_res : *m_sync_list)
+    {
       ::RAJA::cuda::synchronize(list_res);
     }
     m_sync_list->clear();
@@ -755,7 +862,8 @@ struct MultiReduceDataCuda
 
 }  // end namespace cuda
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::cuda::cuda_multi_reduce_policy, cuda::MultiReduceDataCuda)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::cuda::cuda_multi_reduce_policy,
+                                cuda::MultiReduceDataCuda)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/params/kernel_name.hpp b/include/RAJA/policy/cuda/params/kernel_name.hpp
index 4edf645ed3..6411dfe72d 100644
--- a/include/RAJA/policy/cuda/params/kernel_name.hpp
+++ b/include/RAJA/policy/cuda/params/kernel_name.hpp
@@ -7,42 +7,46 @@
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  init(KernelName& kn, const RAJA::cuda::detail::cudaInfo &)
-  {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>>
+init(KernelName& kn, const RAJA::cuda::detail::cudaInfo&)
+{
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-    nvtxRangePush(kn.name);
+  nvtxRangePush(kn.name);
 #else
-    RAJA_UNUSED_VAR(kn);
+  RAJA_UNUSED_VAR(kn);
 #endif
-  }
-
-  // Combine
-  template<typename EXEC_POL>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  combine(KernelName&) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  resolve(KernelName&, const RAJA::cuda::detail::cudaInfo &)
-  {
+}
+
+// Combine
+template <typename EXEC_POL>
+RAJA_HOST_DEVICE
+    camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>>
+    combine(KernelName&)
+{}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>>
+resolve(KernelName&, const RAJA::cuda::detail::cudaInfo&)
+{
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-    nvtxRangePop();
+  nvtxRangePop();
 #endif
-  }
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_CUDA_REDUCE_HPP
+#endif  //  NEW_REDUCE_CUDA_REDUCE_HPP
diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp
index 6ab3372aaa..09ea7b2582 100644
--- a/include/RAJA/policy/cuda/params/reduce.hpp
+++ b/include/RAJA/policy/cuda/params/reduce.hpp
@@ -10,54 +10,57 @@
 
 #include "RAJA/policy/cuda/policy.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red, RAJA::cuda::detail::cudaInfo& ci)
-  {
-    red.devicetarget = RAJA::cuda::pinned_mempool_type::getInstance().template malloc<T>(1);
-    red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z);
-    red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& red)
-  {
-    RAJA::cuda::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(red.devicetarget,
-                                                                            red.getVal(),
-                                                                            red.device_mem,
-                                                                            red.device_count);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red, RAJA::cuda::detail::cudaInfo& ci)
-  {
-    // complete reduction
-    ci.res.wait();
-
-    red.combineTarget(*red.devicetarget);
-
-    // free memory
-    RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count);
-    red.device_count = nullptr;
-    red.device_mem.deallocate();
-    RAJA::cuda::pinned_mempool_type::getInstance().free(red.devicetarget);
-    red.devicetarget = nullptr;
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>>
+init(Reducer<OP, T, VOp>& red, RAJA::cuda::detail::cudaInfo& ci)
+{
+  red.devicetarget =
+      RAJA::cuda::pinned_mempool_type::getInstance().template malloc<T>(1);
+  red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z);
+  red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance()
+                         .template malloc<unsigned int>(1);
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+RAJA_HOST_DEVICE
+    camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>>
+    combine(Reducer<OP, T, VOp>& red)
+{
+  RAJA::cuda::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(
+      red.devicetarget, red.getVal(), red.device_mem, red.device_count);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>>
+resolve(Reducer<OP, T, VOp>& red, RAJA::cuda::detail::cudaInfo& ci)
+{
+  // complete reduction
+  ci.res.wait();
+
+  red.combineTarget(*red.devicetarget);
+
+  // free memory
+  RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count);
+  red.device_count = nullptr;
+  red.device_mem.deallocate();
+  RAJA::cuda::pinned_mempool_type::getInstance().free(red.devicetarget);
+  red.devicetarget = nullptr;
+}
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_CUDA_REDUCE_HPP
+#endif  //  NEW_REDUCE_CUDA_REDUCE_HPP
diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index cd71a37480..a7bac49fd9 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -61,12 +61,14 @@ using cuda_dim_member_t = camp::decay<decltype(std::declval<cuda_dim_t>().x)>;
 namespace detail
 {
 template <bool Async>
-struct get_launch {
+struct get_launch
+{
   static constexpr RAJA::Launch value = RAJA::Launch::async;
 };
 
 template <>
-struct get_launch<false> {
+struct get_launch<false>
+{
   static constexpr RAJA::Launch value = RAJA::Launch::sync;
 };
 }  // end namespace detail
@@ -75,16 +77,16 @@ namespace cuda
 {
 
 /// Type representing thread and block indexing within a grid
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct IndexGlobal;
 
-template<typename ...indexers>
+template <typename... indexers>
 struct IndexFlatten;
 
-template<size_t divisor, typename index>
+template <size_t divisor, typename index>
 struct IndexDivide;
 
-template<size_t divisor, typename index>
+template <size_t divisor, typename index>
 struct IndexModulo;
 
 
@@ -96,13 +98,14 @@ struct IndexModulo;
  */
 struct MaxOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
-    IdxT device_sm_per_device = data.device_sm_per_device;
+    IdxT device_sm_per_device   = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -115,26 +118,31 @@ struct MaxOccupancyConcretizer
  * maximum grid size:
  * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm
  */
-template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
+template <typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
 struct FractionOffsetOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     using Fraction = typename t_Fraction::template rebind<IdxT>;
 
-    IdxT device_sm_per_device = data.device_sm_per_device;
+    IdxT device_sm_per_device   = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) {
+    if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0))
+    {
       func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm);
     }
 
-    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) {
-      func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
+    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) >
+        IdxT(0))
+    {
+      func_max_blocks_per_sm =
+          IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
     }
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -148,22 +156,27 @@ struct FractionOffsetOccupancyConcretizer
  * Otherwise use the given AvoidMaxOccupancyCalculator to determine the
  * maximum grid size.
  */
-template < typename AvoidMaxOccupancyConcretizer >
+template <typename AvoidMaxOccupancyConcretizer>
 struct AvoidDeviceMaxThreadOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
-    IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
-    IdxT func_threads_per_block = data.func_threads_per_block;
+    IdxT func_max_blocks_per_sm    = data.func_max_blocks_per_sm;
+    IdxT func_threads_per_block    = data.func_threads_per_block;
 
-    IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm;
+    IdxT func_max_threads_per_sm =
+        func_threads_per_block * func_max_blocks_per_sm;
 
-    if (func_max_threads_per_sm < device_max_threads_per_sm) {
+    if (func_max_threads_per_sm < device_max_threads_per_sm)
+    {
       return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
-    } else {
-      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
+    }
+    else
+    {
+      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
+          data);
     }
   }
 };
@@ -172,10 +185,10 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer
 /*!
  * Get an amount of replication that is preferred_replication.
  */
-template < size_t preferred_replication >
+template <size_t preferred_replication>
 struct ConstantPreferredReplicationConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data))
   {
     return IdxT(preferred_replication);
@@ -187,19 +200,23 @@ struct ConstantPreferredReplicationConcretizer
  * data.func_threads_per_block is less than t_cutoff or
  * preferred_replication_after_cutoff otherwise.
  */
-template < size_t t_cutoff, size_t preferred_replication_before_cutoff,
-                            size_t preferred_replication_after_cutoff >
+template <size_t t_cutoff,
+          size_t preferred_replication_before_cutoff,
+          size_t preferred_replication_after_cutoff>
 struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& data)
   {
-    IdxT cutoff = t_cutoff;
+    IdxT cutoff                 = t_cutoff;
     IdxT func_threads_per_block = data.func_threads_per_block;
 
-    if (func_threads_per_block < cutoff) {
+    if (func_threads_per_block < cutoff)
+    {
       return IdxT(preferred_replication_before_cutoff);
-    } else {
+    }
+    else
+    {
       return IdxT(preferred_replication_after_cutoff);
     }
   }
@@ -210,19 +227,21 @@ struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
  * most the amount given by data.func_max_shared_replication_per_block or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
+template <typename GetPreferredReplication>
 struct SharedAtomicReplicationMaxPow2Concretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_shared_replication(Data const& data)
   {
-    IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block;
+    IdxT func_max_shared_replication_per_block =
+        data.func_max_shared_replication_per_block;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
+            data);
 
-    return prev_pow2(std::min(preferred_replication,
-                              func_max_shared_replication_per_block));
+    return prev_pow2(
+        std::min(preferred_replication, func_max_shared_replication_per_block));
   }
 };
 
@@ -231,18 +250,20 @@ struct SharedAtomicReplicationMaxPow2Concretizer
  * least the amount given by data.func_min_global_replication or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
+template <typename GetPreferredReplication>
 struct GlobalAtomicReplicationMinPow2Concretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_global_replication(Data const& data)
   {
     IdxT func_min_global_replication = data.func_min_global_replication;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
+            data);
 
-    return next_pow2(std::max(preferred_replication, func_min_global_replication));
+    return next_pow2(
+        std::max(preferred_replication, func_min_global_replication));
   }
 };
 
@@ -260,14 +281,16 @@ enum struct block_communication_mode : int
   block_fence
 };
 
-template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode,
-           size_t t_replication, size_t t_atomic_stride >
+template <reduce_algorithm t_algorithm,
+          block_communication_mode t_comm_mode,
+          size_t t_replication,
+          size_t t_atomic_stride>
 struct ReduceTuning
 {
-  static constexpr reduce_algorithm algorithm = t_algorithm;
+  static constexpr reduce_algorithm algorithm         = t_algorithm;
   static constexpr block_communication_mode comm_mode = t_comm_mode;
-  static constexpr size_t replication = t_replication;
-  static constexpr size_t atomic_stride = t_atomic_stride;
+  static constexpr size_t replication                 = t_replication;
+  static constexpr size_t atomic_stride               = t_atomic_stride;
   static constexpr bool consistent =
       (algorithm == reduce_algorithm::combine_last_block);
 };
@@ -279,25 +302,25 @@ enum struct multi_reduce_algorithm : int
   init_host_combine_global_atomic
 };
 
-template < typename t_AtomicReplicationConcretizer,
-           typename t_ReplicationIndexer,
-           typename t_OffsetCalculator >
+template <typename t_AtomicReplicationConcretizer,
+          typename t_ReplicationIndexer,
+          typename t_OffsetCalculator>
 struct AtomicReplicationTuning
 {
   using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer;
-  using ReplicationIndexer = t_ReplicationIndexer;
-  using OffsetCalculator = t_OffsetCalculator;
+  using ReplicationIndexer           = t_ReplicationIndexer;
+  using OffsetCalculator             = t_OffsetCalculator;
 };
 
-template < multi_reduce_algorithm t_algorithm,
-           typename t_SharedAtomicReplicationTuning,
-           typename t_GlobalAtomicReplicationTuning >
+template <multi_reduce_algorithm t_algorithm,
+          typename t_SharedAtomicReplicationTuning,
+          typename t_GlobalAtomicReplicationTuning>
 struct MultiReduceTuning
 {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
   using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
   using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
-  static constexpr bool consistent = false;
+  static constexpr bool consistent    = false;
 };
 
 }  // namespace cuda
@@ -312,25 +335,29 @@ struct DeviceConstants
   RAJA::Index_type WARP_SIZE;
   RAJA::Index_type MAX_BLOCK_SIZE;
   RAJA::Index_type MAX_WARPS;
-  RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics
+  RAJA::Index_type
+      ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE;  // basically the cache line size of
+                                             // the cache level that handles
+                                             // atomics
 
   constexpr DeviceConstants(RAJA::Index_type warp_size,
                             RAJA::Index_type max_block_size,
                             RAJA::Index_type atomic_cache_line_bytes) noexcept
-    : WARP_SIZE(warp_size)
-    , MAX_BLOCK_SIZE(max_block_size)
-    , MAX_WARPS(max_block_size / warp_size)
-    , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
-  { }
+      : WARP_SIZE(warp_size),
+        MAX_BLOCK_SIZE(max_block_size),
+        MAX_WARPS(max_block_size / warp_size),
+        ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
+  {}
 };
 
 //
 // Operations in the included files are parametrized using the following
 // values for CUDA warp size and max block size.
 //
-constexpr DeviceConstants device_constants(32, 1024, 32); // V100
+constexpr DeviceConstants device_constants(32, 1024, 32);  // V100
 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
-              "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS");
+              "RAJA Assumption Broken: device_constants.WARP_SIZE < "
+              "device_constants.MAX_WARPS");
 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
               "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
               "a multiple of device_constants.WARP_SIZE");
@@ -339,38 +366,51 @@ constexpr const size_t MIN_BLOCKS_PER_SM = 1;
 constexpr const size_t MAX_BLOCKS_PER_SM = 32;
 
 
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct cuda_indexer {};
-
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct cuda_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t<
-  RAJA::Policy::cuda,
-  RAJA::Pattern::region,
-  detail::get_launch<true /*async */>::value,
-  RAJA::Platform::cuda> {
+template <typename _IterationMapping,
+          kernel_sync_requirement sync,
+          typename... _IterationGetters>
+struct cuda_indexer
+{};
+
+template <typename _IterationMapping,
+          kernel_sync_requirement sync,
+          typename... _IterationGetters>
+struct cuda_flatten_indexer
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::region,
+          detail::get_launch<true /*async */>::value,
+          RAJA::Platform::cuda>
+{
   using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>;
 };
 
-template <typename _IterationMapping, typename _IterationGetter, typename _LaunchConcretizer,
-          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
+template <typename _IterationMapping,
+          typename _IterationGetter,
+          typename _LaunchConcretizer,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+          bool Async           = false>
 struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::forall,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::cuda> {
-  using IterationMapping = _IterationMapping;
-  using IterationGetter = _IterationGetter;
+                                RAJA::Policy::cuda,
+                                RAJA::Pattern::forall,
+                                detail::get_launch<Async>::value,
+                                RAJA::Platform::cuda>
+{
+  using IterationMapping  = _IterationMapping;
+  using IterationGetter   = _IterationGetter;
   using LaunchConcretizer = _LaunchConcretizer;
 };
 
-template <bool Async, int num_threads = named_usage::unspecified,
+template <bool Async,
+          int num_threads      = named_usage::unspecified,
           size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t<
-                                RAJA::Policy::cuda,
-                                RAJA::Pattern::region,
-                                detail::get_launch<Async>::value,
-                                RAJA::Platform::cuda> {
-};
+struct cuda_launch_explicit_t
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::region,
+          detail::get_launch<Async>::value,
+          RAJA::Platform::cuda>
+{};
 
 
 //
@@ -380,13 +420,15 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform
 ///
 /// WorkGroup execution policies
 ///
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+          bool Async           = false>
 struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::workgroup_exec,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::cuda> {
-};
+                                RAJA::Policy::cuda,
+                                RAJA::Pattern::workgroup_exec,
+                                detail::get_launch<Async>::value,
+                                RAJA::Platform::cuda>
+{};
 
 /// execute the enqueued loops in an unordered fashion by mapping loops to
 /// blocks in the y direction and loop iterations to threads in the x direction
@@ -394,10 +436,10 @@ struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t<
 /// of all the loops
 struct unordered_cuda_loop_y_block_iter_x_threadblock_average
     : public RAJA::make_policy_pattern_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::workgroup_order,
-                       RAJA::Platform::cuda> {
-};
+          RAJA::Policy::cuda,
+          RAJA::Pattern::workgroup_order,
+          RAJA::Platform::cuda>
+{};
 
 
 ///
@@ -408,36 +450,36 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average
 ///////////////////////////////////////////////////////////////////////
 ///
 
-template < typename tuning >
-struct cuda_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
-                                                RAJA::Pattern::reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::cuda,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
-};
-
-template < typename tuning >
+template <typename tuning>
+struct cuda_reduce_policy : public RAJA::make_policy_pattern_launch_platform_t<
+                                RAJA::Policy::cuda,
+                                RAJA::Pattern::reduce,
+                                detail::get_launch<false>::value,
+                                RAJA::Platform::cuda,
+                                std::conditional_t<tuning::consistent,
+                                                   reduce::ordered,
+                                                   reduce::unordered>>
+{};
+
+template <typename tuning>
 struct cuda_multi_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
-                                                RAJA::Pattern::multi_reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::cuda,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
-};
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::multi_reduce,
+          detail::get_launch<false>::value,
+          RAJA::Platform::cuda,
+          std::conditional_t<tuning::consistent,
+                             reduce::ordered,
+                             reduce::unordered>>
+{};
 
 /*!
  * Cuda atomic policy for using cuda atomics on the device and
  * the provided policy on the host
  */
-template<typename host_policy>
-struct cuda_atomic_explicit{};
+template <typename host_policy>
+struct cuda_atomic_explicit
+{};
 
 /*!
  * Default cuda atomic policy uses cuda atomics on the device and non-atomics
@@ -448,23 +490,26 @@ using cuda_atomic = cuda_atomic_explicit<seq_atomic>;
 
 // Policy for RAJA::statement::Reduce that reduces threads in a block
 // down to threadIdx 0
-struct cuda_block_reduce{};
+struct cuda_block_reduce
+{};
 
 // Policy for RAJA::statement::Reduce that reduces threads in a warp
 // down to the first lane of the warp
-struct cuda_warp_reduce{};
+struct cuda_warp_reduce
+{};
 
 // Policy to map work directly to threads within a warp
 // Maximum iteration count is WARP_SIZE
 // Cannot be used in conjunction with cuda_thread_x_*
 // Multiple warps have to be created by using cuda_thread_{yz}_*
-struct cuda_warp_direct{};
+struct cuda_warp_direct
+{};
 
 // Policy to map work to threads within a warp using a warp-stride loop
 // Cannot be used in conjunction with cuda_thread_x_*
 // Multiple warps have to be created by using cuda_thread_{yz}_*
-struct cuda_warp_loop{};
-
+struct cuda_warp_loop
+{};
 
 
 // Policy to map work to threads within a warp using a bit mask
@@ -473,8 +518,9 @@ struct cuda_warp_loop{};
 // Since we are masking specific threads, multiple nested
 // cuda_warp_masked
 // can be used to create complex thread interleaving patterns
-template<typename Mask>
-struct cuda_warp_masked_direct {};
+template <typename Mask>
+struct cuda_warp_masked_direct
+{};
 
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with cuda_thread_x_*
@@ -482,21 +528,24 @@ struct cuda_warp_masked_direct {};
 // Since we are masking specific threads, multiple nested
 // cuda_warp_masked
 // can be used to create complex thread interleaving patterns
-template<typename Mask>
-struct cuda_warp_masked_loop {};
+template <typename Mask>
+struct cuda_warp_masked_loop
+{};
 
 
-template<typename Mask>
-struct cuda_thread_masked_direct {};
+template <typename Mask>
+struct cuda_thread_masked_direct
+{};
 
-template<typename Mask>
-struct cuda_thread_masked_loop {};
+template <typename Mask>
+struct cuda_thread_masked_loop
+{};
 
 
 struct cuda_synchronize : make_policy_pattern_launch_t<Policy::cuda,
                                                        Pattern::synchronize,
-                                                       Launch::sync> {
-};
+                                                       Launch::sync>
+{};
 
 }  // end namespace cuda
 }  // end namespace policy
@@ -508,141 +557,131 @@ namespace internal
 RAJA_INLINE
 int get_size(cuda_dim_t dims)
 {
-  if(dims.x == 0 && dims.y == 0 && dims.z == 0){
+  if (dims.x == 0 && dims.y == 0 && dims.z == 0)
+  {
     return 0;
   }
-  return (dims.x ? dims.x : 1) *
-         (dims.y ? dims.y : 1) *
-         (dims.z ? dims.z : 1);
+  return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
 }
 
-struct CudaDims {
+struct CudaDims
+{
 
-  cuda_dim_t blocks{0,0,0};
-  cuda_dim_t threads{0,0,0};
+  cuda_dim_t blocks {0, 0, 0};
+  cuda_dim_t threads {0, 0, 0};
 
-  CudaDims() = default;
-  CudaDims(CudaDims const&) = default;
+  CudaDims()                           = default;
+  CudaDims(CudaDims const&)            = default;
   CudaDims& operator=(CudaDims const&) = default;
 
   RAJA_INLINE
   CudaDims(cuda_dim_member_t default_val)
-    : blocks{default_val, default_val, default_val}
-    , threads{default_val, default_val, default_val}
-  { }
+      : blocks {default_val, default_val, default_val},
+        threads {default_val, default_val, default_val}
+  {}
 
   RAJA_INLINE
-  int num_blocks() const {
-    return get_size(blocks);
-  }
+  int num_blocks() const { return get_size(blocks); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return get_size(threads);
-  }
+  int num_threads() const { return get_size(threads); }
 
   RAJA_INLINE
-  cuda_dim_t get_blocks() const {
-    if (num_blocks() != 0) {
-      return {(blocks.x ? blocks.x : 1),
-              (blocks.y ? blocks.y : 1),
+  cuda_dim_t get_blocks() const
+  {
+    if (num_blocks() != 0)
+    {
+      return {(blocks.x ? blocks.x : 1), (blocks.y ? blocks.y : 1),
               (blocks.z ? blocks.z : 1)};
-    } else {
+    }
+    else
+    {
       return blocks;
     }
   }
 
   RAJA_INLINE
-  cuda_dim_t get_threads() const {
-    if (num_threads() != 0) {
-      return {(threads.x ? threads.x : 1),
-              (threads.y ? threads.y : 1),
+  cuda_dim_t get_threads() const
+  {
+    if (num_threads() != 0)
+    {
+      return {(threads.x ? threads.x : 1), (threads.y ? threads.y : 1),
               (threads.z ? threads.z : 1)};
-    } else {
+    }
+    else
+    {
       return threads;
     }
   }
 };
 
-template<named_dim dim>
+template <named_dim dim>
 struct CudaDimHelper;
 
-template<>
-struct CudaDimHelper<named_dim::x>{
+template <>
+struct CudaDimHelper<named_dim::x>
+{
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.x;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.x = value;
   }
 };
 
-template<>
-struct CudaDimHelper<named_dim::y>{
+template <>
+struct CudaDimHelper<named_dim::y>
+{
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.y;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.y = value;
   }
 };
 
-template<>
-struct CudaDimHelper<named_dim::z>{
+template <>
+struct CudaDimHelper<named_dim::z>
+{
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.z;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.z = value;
   }
 };
 
-template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-constexpr
-cuda_dim_member_t get_cuda_dim(dim_t const &d)
+template <named_dim dim, typename dim_t>
+RAJA_HOST_DEVICE constexpr cuda_dim_member_t get_cuda_dim(dim_t const& d)
 {
   return CudaDimHelper<dim>::get(d);
 }
 
-template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-void set_cuda_dim(dim_t &d, cuda_dim_member_t value)
+template <named_dim dim, typename dim_t>
+RAJA_HOST_DEVICE void set_cuda_dim(dim_t& d, cuda_dim_member_t value)
 {
   return CudaDimHelper<dim>::set(d, value);
 }
 
-} // namespace internal
+}  // namespace internal
 
 namespace cuda
 {
@@ -651,14 +690,13 @@ namespace cuda
 struct IndexSize
 {
   cuda_dim_member_t block_size = named_usage::unspecified;
-  cuda_dim_member_t grid_size = named_usage::unspecified;
-
-  RAJA_HOST_DEVICE constexpr
-  IndexSize(cuda_dim_member_t _block_size = named_usage::unspecified,
-            cuda_dim_member_t _grid_size = named_usage::unspecified)
-    : block_size(_block_size)
-    , grid_size(_grid_size)
-  { }
+  cuda_dim_member_t grid_size  = named_usage::unspecified;
+
+  RAJA_HOST_DEVICE constexpr IndexSize(
+      cuda_dim_member_t _block_size = named_usage::unspecified,
+      cuda_dim_member_t _grid_size  = named_usage::unspecified)
+      : block_size(_block_size), grid_size(_grid_size)
+  {}
 };
 
 /// Type representing thread indexing within a grid
@@ -666,436 +704,461 @@ struct IndexSize
 
 /// useful for global indexing
 /// with fixed block size and fixed grid size
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct IndexGlobal
 {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(block_size) * static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed block size of 1 and fixed grid size
-template<named_dim dim, int GRID_SIZE>
+template <named_dim dim, int GRID_SIZE>
 struct IndexGlobal<dim, 1, GRID_SIZE>
 {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = 1;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed block size and fixed grid size of 1
-template<named_dim dim, int BLOCK_SIZE>
+template <named_dim dim, int BLOCK_SIZE>
 struct IndexGlobal<dim, BLOCK_SIZE, 1>
 {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
 /// with fixed block size and fixed grid size of 1
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, 1, 1>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
 /// with dynamic block size and fixed grid size
-template<named_dim dim, int GRID_SIZE>
+template <named_dim dim, int GRID_SIZE>
 struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>
 {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(grid_size);
   }
 };
 /// with dynamic block size and fixed grid size of 1
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, 1>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockDim));
   }
 };
 
 /// with fixed block size and dynamic grid size
-template<named_dim dim, int BLOCK_SIZE>
+template <named_dim dim, int BLOCK_SIZE>
 struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>
 {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 /// with fixed block size of 1 and dynamic grid size
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, 1, named_usage::unspecified>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
 /// with dynamic block size and dynamic grid size
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
 /// useful for indexing blocks (ignores thread indices)
 /// with fixed grid size
-template<named_dim dim, int GRID_SIZE>
+template <named_dim dim, int GRID_SIZE>
 struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE>
 {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed grid sized of 1
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, 1>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 /// with dynamic grid size
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
 /// useful for indexing threads (ignores block indices)
 /// with fixed block size
-template<named_dim dim, int BLOCK_SIZE>
+template <named_dim dim, int BLOCK_SIZE>
 struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>
 {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
 /// with fixed block size of 1
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, 1, named_usage::ignored>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 /// with dynamic block size
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockDim));
   }
 };
 
 /// useful for doing single threaded sequential tasks
 /// (ignores thread and block indices)
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
 // useful for flatten global index (includes x)
-template<typename x_index>
+template <typename x_index>
 struct IndexFlatten<x_index>
 {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>();
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>();
+    return x_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y)
-template<typename x_index, typename y_index>
+template <typename x_index, typename y_index>
 struct IndexFlatten<x_index, y_index>
 {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>());
-
+           x_index::template size<IdxT>() * (y_index::template index<IdxT>());
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y,z)
-template<typename x_index, typename y_index, typename z_index>
+template <typename x_index, typename y_index, typename z_index>
 struct IndexFlatten<x_index, y_index, z_index>
 {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>() +
-                                         y_index::template size<IdxT>() * z_index::template index<IdxT>());
+           x_index::template size<IdxT>() *
+               (y_index::template index<IdxT>() +
+                y_index::template size<IdxT>() *
+                    z_index::template index<IdxT>());
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> () * z_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>() *
+           z_index::template size<IdxT>();
   }
-
 };
 
-template<size_t divisor, typename indexer>
+template <size_t divisor, typename indexer>
 struct IndexDivide
 {
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() / static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(), static_cast<IdxT>(divisor));
+    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(),
+                                   static_cast<IdxT>(divisor));
   }
 };
 
-template<size_t divisor, typename indexer>
+template <size_t divisor, typename indexer>
 struct IndexModulo
 {
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() % static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template <typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(divisor);
@@ -1104,10 +1167,10 @@ struct IndexModulo
 
 
 // helper to get just the thread indexing part of IndexGlobal
-template < typename index_global >
+template <typename index_global>
 struct get_index_thread;
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
 {
   using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
@@ -1122,10 +1185,10 @@ struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
 };
 
 // helper to get just the block indexing part of IndexGlobal
-template < typename index_global >
+template <typename index_global>
 struct get_index_block;
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
 {
   using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
@@ -1140,85 +1203,83 @@ struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
 };
 
 
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
 
-template <size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
+template <size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified>
 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
                                 thread_y<BLOCK_SIZE_Y>,
                                 thread_z<BLOCK_SIZE_Z>>;
 
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
 
-template <size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+template <size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
                                block_y<GRID_SIZE_Y>,
                                block_z<GRID_SIZE_Z>>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
 
 
 template <size_t BLOCK_SIZE_X,
           size_t BLOCK_SIZE_Y,
           size_t BLOCK_SIZE_Z,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+          size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
                                 global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
                                 global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
 
 
-template <size_t WARP_SIZE=RAJA::policy::cuda::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
-using warp_xyz = IndexDivide<WARP_SIZE,
-                             thread_xyz<BLOCK_SIZE_X,
-                                        BLOCK_SIZE_Y,
-                                        BLOCK_SIZE_Z>>;
-
-template <size_t WARP_SIZE=RAJA::policy::cuda::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
-using warp_global_xyz = IndexFlatten<warp_xyz<WARP_SIZE,
-                                              BLOCK_SIZE_X,
-                                              BLOCK_SIZE_Y,
-                                              BLOCK_SIZE_Z>,
-                                     block_xyz<GRID_SIZE_X,
-                                               GRID_SIZE_Y,
-                                               GRID_SIZE_Z>>;
-
-} // namespace cuda
+template <size_t WARP_SIZE    = RAJA::policy::cuda::device_constants.WARP_SIZE,
+          size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified>
+using warp_xyz =
+    IndexDivide<WARP_SIZE,
+                thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
+
+template <size_t WARP_SIZE    = RAJA::policy::cuda::device_constants.WARP_SIZE,
+          size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified,
+          size_t GRID_SIZE_X  = named_usage::unspecified,
+          size_t GRID_SIZE_Y  = named_usage::unspecified,
+          size_t GRID_SIZE_Z  = named_usage::unspecified>
+using warp_global_xyz =
+    IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
+                 block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
+
+}  // namespace cuda
 
 // contretizers used in forall, scan, and sort policies
 
-using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer<cuda::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
+using CudaAvoidDeviceMaxThreadOccupancyConcretizer =
+    cuda::AvoidDeviceMaxThreadOccupancyConcretizer<
+        cuda::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
 
-template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
-using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
+template <typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
+using CudaFractionOffsetOccupancyConcretizer =
+    cuda::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
 
 using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer;
 
@@ -1228,179 +1289,286 @@ using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer;
 
 // policies usable with forall, scan, and sort
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
+template <size_t BLOCK_SIZE,
+          size_t GRID_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async = false>
 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
 using cuda_exec_grid = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE>
 using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
-using cuda_exec_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
+using cuda_exec_explicit =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     BLOCKS_PER_SM,
+                                     Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
-using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
+using cuda_exec_explicit_async =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     BLOCKS_PER_SM,
+                                     true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
-using cuda_exec = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+using cuda_exec =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     policy::cuda::MIN_BLOCKS_PER_SM,
+                                     Async>;
 
 template <size_t BLOCK_SIZE>
-using cuda_exec_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+using cuda_exec_async =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     policy::cuda::MIN_BLOCKS_PER_SM,
+                                     true>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Fraction, bool Async = false>
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          typename Fraction,
+          bool Async = false>
 using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Fraction>
 using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, typename Fraction, bool Async = false>
 using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, typename Fraction>
 using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Concretizer, bool Async = false>
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          typename Concretizer,
+          bool Async = false>
 using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Concretizer>
 using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
 using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, typename Concretizer>
 using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <bool with_reduce, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
-using cuda_exec_base_explicit = std::conditional_t<with_reduce,
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template <bool with_reduce,
+          size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM,
+          bool Async = false>
+using cuda_exec_base_explicit = std::conditional_t<
+    with_reduce,
     cuda_exec_with_reduce_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>>;
 
 template <bool with_reduce, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
-using cuda_exec_base_explicit_async = std::conditional_t<with_reduce,
+using cuda_exec_base_explicit_async = std::conditional_t<
+    with_reduce,
     cuda_exec_with_reduce_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
     cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>>;
 
 template <bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
-using cuda_exec_base = std::conditional_t<with_reduce,
-    cuda_exec_with_reduce<BLOCK_SIZE, Async>,
-    cuda_exec<BLOCK_SIZE, Async>>;
+using cuda_exec_base =
+    std::conditional_t<with_reduce,
+                       cuda_exec_with_reduce<BLOCK_SIZE, Async>,
+                       cuda_exec<BLOCK_SIZE, Async>>;
 
 template <bool with_reduce, size_t BLOCK_SIZE>
-using cuda_exec_base_async = std::conditional_t<with_reduce,
-    cuda_exec_with_reduce_async<BLOCK_SIZE>,
-    cuda_exec_async<BLOCK_SIZE>>;
+using cuda_exec_base_async =
+    std::conditional_t<with_reduce,
+                       cuda_exec_with_reduce_async<BLOCK_SIZE>,
+                       cuda_exec_async<BLOCK_SIZE>>;
 
 
 // policies usable with WorkGroup
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
-using cuda_work_explicit = policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+          bool Async           = false>
+using cuda_work_explicit =
+    policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-using cuda_work_explicit_async = policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>;
+template <size_t BLOCK_SIZE,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
+using cuda_work_explicit_async =
+    policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
-using cuda_work = policy::cuda::cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+using cuda_work = policy::cuda::
+    cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
 
 template <size_t BLOCK_SIZE>
-using cuda_work_async = policy::cuda::cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+using cuda_work_async = policy::cuda::
+    cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, true>;
 
 using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
 
@@ -1410,10 +1578,10 @@ using policy::cuda::cuda_atomic_explicit;
 
 
 // policies usable with reducers
-template < cuda::reduce_algorithm algorithm,
-           cuda::block_communication_mode comm_mode,
-           size_t replication = named_usage::unspecified,
-           size_t atomic_stride = named_usage::unspecified >
+template <cuda::reduce_algorithm algorithm,
+          cuda::block_communication_mode comm_mode,
+          size_t replication   = named_usage::unspecified,
+          size_t atomic_stride = named_usage::unspecified>
 using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy<
     cuda::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
 
@@ -1436,35 +1604,41 @@ using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy<
 //                 a cache shared by the whole device to avoid having to use
 //                 device scope fences. This improves performance on some HW but
 //                 is more difficult to code correctly.
-using cuda_reduce_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::combine_last_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::combine_last_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_device_combine_atomic_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_device_init_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_device_combine_atomic_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_device_init_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_host_combine_atomic_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_host_init_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_host_combine_atomic_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_host_init_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 
 // Policy for RAJA::Reduce* objects that gives the same answer every time when
 // used in the same way
@@ -1476,25 +1650,26 @@ using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence;
 
 // Policy for RAJA::Reduce* objects that lets you select the default atomic or
 // non-atomic policy with a bool
-template < bool with_atomic >
-using cuda_reduce_base = std::conditional_t<with_atomic, cuda_reduce_atomic, cuda_reduce>;
+template <bool with_atomic>
+using cuda_reduce_base =
+    std::conditional_t<with_atomic, cuda_reduce_atomic, cuda_reduce>;
 
 
 // policies usable with multi_reducers
-template < cuda::multi_reduce_algorithm algorithm,
-           typename SharedAtomicReplicationConcretizer,
-           typename SharedAtomicReplicationIndexer,
-           typename GlobalAtomicReplicationConcretizer,
-           typename GlobalAtomicReplicationIndexer >
-using cuda_multi_reduce_tuning = policy::cuda::cuda_multi_reduce_policy<
-    cuda::MultiReduceTuning<
-      algorithm,
-      cuda::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
-                                    SharedAtomicReplicationIndexer,
-                                    GetOffsetRight<int>>,
-      cuda::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
-                                    GlobalAtomicReplicationIndexer,
-                                    GetOffsetLeft<int>>>>;
+template <cuda::multi_reduce_algorithm algorithm,
+          typename SharedAtomicReplicationConcretizer,
+          typename SharedAtomicReplicationIndexer,
+          typename GlobalAtomicReplicationConcretizer,
+          typename GlobalAtomicReplicationIndexer>
+using cuda_multi_reduce_tuning =
+    policy::cuda::cuda_multi_reduce_policy<cuda::MultiReduceTuning<
+        algorithm,
+        cuda::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
+                                      SharedAtomicReplicationIndexer,
+                                      GetOffsetRight<int>>,
+        cuda::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
+                                      GlobalAtomicReplicationIndexer,
+                                      GetOffsetLeft<int>>>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - *atomic* policies may use atomics to combine partial results. The
@@ -1508,44 +1683,51 @@ using cuda_multi_reduce_tuning = policy::cuda::cuda_multi_reduce_policy<
 //   This is faster overall than other policies on HW with direct host access
 //   to device memory such as the IBM power 9 + Nvidia V100 Sierra/Lassen
 //   systems.
-using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    cuda::SharedAtomicReplicationMaxPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<16>>,
-    cuda::thread_xyz<>,
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<2>>,
-    cuda::warp_global_xyz<>>;
+using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        cuda::SharedAtomicReplicationMaxPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<16>>,
+        cuda::thread_xyz<>,
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<2>>,
+        cuda::warp_global_xyz<>>;
 // special policy to test that multi-reducers work if there is not enough shmem
-using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    cuda::SharedAtomicReplicationMaxPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<0>>,
-    cuda::thread_xyz<>,
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<2>>,
-    cuda::warp_global_xyz<>>;
+using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        cuda::SharedAtomicReplicationMaxPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<0>>,
+        cuda::thread_xyz<>,
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<2>>,
+        cuda::warp_global_xyz<>>;
 //
 using cuda_multi_reduce_atomic_global_host_init = cuda_multi_reduce_tuning<
     cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
+    void,  // unused with this algorithm
+    void,  // unused with this algorithm
     cuda::GlobalAtomicReplicationMinPow2Concretizer<
         cuda::ConstantPreferredReplicationConcretizer<2>>,
     cuda::warp_global_xyz<>>;
 //
-using cuda_multi_reduce_atomic_global_no_replication_host_init = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<1>>,
-    cuda::block_xyz<>>;
-
-// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the
-// same answer every time when used in the same way
-using cuda_multi_reduce_atomic = cuda_multi_reduce_atomic_block_then_atomic_grid_host_init;
-// Similar to above but optimized for low overhead in cases where it is rarely used
+using cuda_multi_reduce_atomic_global_no_replication_host_init =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
+        void,  // unused with this algorithm
+        void,  // unused with this algorithm
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<1>>,
+        cuda::block_xyz<>>;
+
+// Policy for RAJA::MultiReduce* objects that may use atomics and may not give
+// the same answer every time when used in the same way
+using cuda_multi_reduce_atomic =
+    cuda_multi_reduce_atomic_block_then_atomic_grid_host_init;
+// Similar to above but optimized for low overhead in cases where it is rarely
+// used
 using cuda_multi_reduce_atomic_low_performance_low_overhead =
     cuda_multi_reduce_atomic_global_no_replication_host_init;
 
@@ -1573,41 +1755,49 @@ using policy::cuda::cuda_thread_masked_loop;
 using policy::cuda::cuda_synchronize;
 
 // policies usable with launch
-template <bool Async, int num_threads = named_usage::unspecified, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-using cuda_launch_explicit_t = policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;
+template <bool Async,
+          int num_threads      = named_usage::unspecified,
+          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
+using cuda_launch_explicit_t =
+    policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;
 
-//CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
+// CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
 template <bool Async, int num_threads = named_usage::unspecified>
-using cuda_launch_t = policy::cuda::cuda_launch_explicit_t<Async, num_threads,
-    (num_threads == named_usage::unspecified) ? named_usage::unspecified : policy::cuda::MIN_BLOCKS_PER_SM>;
+using cuda_launch_t =
+    policy::cuda::cuda_launch_explicit_t<Async,
+                                         num_threads,
+                                         (num_threads ==
+                                          named_usage::unspecified)
+                                             ? named_usage::unspecified
+                                             : policy::cuda::MIN_BLOCKS_PER_SM>;
 
 
 // policies usable with kernel and launch
-template < typename ... indexers >
-using cuda_indexer_direct = policy::cuda::cuda_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template <typename... indexers>
+using cuda_indexer_direct =
+    policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                               kernel_sync_requirement::none,
+                               indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using cuda_indexer_loop = policy::cuda::cuda_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
     indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::sync,
     indexers...>;
 
-template < typename ... indexers >
-using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template <typename... indexers>
+using cuda_flatten_indexer_direct =
+    policy::cuda::cuda_flatten_indexer<iteration_mapping::Direct,
+                                       kernel_sync_requirement::none,
+                                       indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
@@ -1620,7 +1810,7 @@ using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
  * For example, a segment of size 2000 will not fit, and trigger a runtime
  * error.
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_thread_direct = cuda_indexer_direct<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1635,22 +1825,28 @@ using cuda_thread_yz_direct = cuda_thread_direct<named_dim::y, named_dim::z>;
 using cuda_thread_zx_direct = cuda_thread_direct<named_dim::z, named_dim::x>;
 using cuda_thread_zy_direct = cuda_thread_direct<named_dim::z, named_dim::y>;
 
-using cuda_thread_xyz_direct = cuda_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_thread_xzy_direct = cuda_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_thread_yxz_direct = cuda_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_thread_yzx_direct = cuda_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_thread_zxy_direct = cuda_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_thread_zyx_direct = cuda_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_thread_xyz_direct =
+    cuda_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_thread_xzy_direct =
+    cuda_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_thread_yxz_direct =
+    cuda_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_thread_yzx_direct =
+    cuda_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_thread_zxy_direct =
+    cuda_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_thread_zyx_direct =
+    cuda_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to CUDA threads.
  * Uses block-stride looping to exceed the maximum number of physical threads
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_thread_loop = cuda_indexer_loop<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_thread_syncable_loop = cuda_indexer_syncable_loop<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1665,12 +1861,18 @@ using cuda_thread_yz_loop = cuda_thread_loop<named_dim::y, named_dim::z>;
 using cuda_thread_zx_loop = cuda_thread_loop<named_dim::z, named_dim::x>;
 using cuda_thread_zy_loop = cuda_thread_loop<named_dim::z, named_dim::y>;
 
-using cuda_thread_xyz_loop = cuda_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_thread_xzy_loop = cuda_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_thread_yxz_loop = cuda_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_thread_yzx_loop = cuda_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_thread_zxy_loop = cuda_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_thread_zyx_loop = cuda_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_thread_xyz_loop =
+    cuda_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_thread_xzy_loop =
+    cuda_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_thread_yxz_loop =
+    cuda_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_thread_yzx_loop =
+    cuda_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_thread_zxy_loop =
+    cuda_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_thread_zyx_loop =
+    cuda_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA threads.
@@ -1678,7 +1880,7 @@ using cuda_thread_zyx_loop = cuda_thread_loop<named_dim::z, named_dim::y, named_
  * physical threads to fit all of the direct map requests.
  * Reshapes multiple physical threads into a 1D iteration space
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_flatten_thread_direct = cuda_flatten_indexer_direct<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1686,26 +1888,38 @@ using cuda_flatten_thread_x_direct = cuda_flatten_thread_direct<named_dim::x>;
 using cuda_flatten_thread_y_direct = cuda_flatten_thread_direct<named_dim::y>;
 using cuda_flatten_thread_z_direct = cuda_flatten_thread_direct<named_dim::z>;
 
-using cuda_flatten_thread_xy_direct = cuda_flatten_thread_direct<named_dim::x, named_dim::y>;
-using cuda_flatten_thread_xz_direct = cuda_flatten_thread_direct<named_dim::x, named_dim::z>;
-using cuda_flatten_thread_yx_direct = cuda_flatten_thread_direct<named_dim::y, named_dim::x>;
-using cuda_flatten_thread_yz_direct = cuda_flatten_thread_direct<named_dim::y, named_dim::z>;
-using cuda_flatten_thread_zx_direct = cuda_flatten_thread_direct<named_dim::z, named_dim::x>;
-using cuda_flatten_thread_zy_direct = cuda_flatten_thread_direct<named_dim::z, named_dim::y>;
-
-using cuda_flatten_thread_xyz_direct = cuda_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_thread_xzy_direct = cuda_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_thread_yxz_direct = cuda_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_thread_yzx_direct = cuda_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_thread_zxy_direct = cuda_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_thread_zyx_direct = cuda_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_thread_xy_direct =
+    cuda_flatten_thread_direct<named_dim::x, named_dim::y>;
+using cuda_flatten_thread_xz_direct =
+    cuda_flatten_thread_direct<named_dim::x, named_dim::z>;
+using cuda_flatten_thread_yx_direct =
+    cuda_flatten_thread_direct<named_dim::y, named_dim::x>;
+using cuda_flatten_thread_yz_direct =
+    cuda_flatten_thread_direct<named_dim::y, named_dim::z>;
+using cuda_flatten_thread_zx_direct =
+    cuda_flatten_thread_direct<named_dim::z, named_dim::x>;
+using cuda_flatten_thread_zy_direct =
+    cuda_flatten_thread_direct<named_dim::z, named_dim::y>;
+
+using cuda_flatten_thread_xyz_direct =
+    cuda_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_thread_xzy_direct =
+    cuda_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_thread_yxz_direct =
+    cuda_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_thread_yzx_direct =
+    cuda_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_thread_zxy_direct =
+    cuda_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_thread_zyx_direct =
+    cuda_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA threads.
  * Reshapes multiple physical threads into a 1D iteration space
  * Uses block-stride looping to exceed the maximum number of physical threads
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_flatten_thread_loop = cuda_flatten_indexer_loop<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1713,19 +1927,31 @@ using cuda_flatten_thread_x_loop = cuda_flatten_thread_loop<named_dim::x>;
 using cuda_flatten_thread_y_loop = cuda_flatten_thread_loop<named_dim::y>;
 using cuda_flatten_thread_z_loop = cuda_flatten_thread_loop<named_dim::z>;
 
-using cuda_flatten_thread_xy_loop = cuda_flatten_thread_loop<named_dim::x, named_dim::y>;
-using cuda_flatten_thread_xz_loop = cuda_flatten_thread_loop<named_dim::x, named_dim::z>;
-using cuda_flatten_thread_yx_loop = cuda_flatten_thread_loop<named_dim::y, named_dim::x>;
-using cuda_flatten_thread_yz_loop = cuda_flatten_thread_loop<named_dim::y, named_dim::z>;
-using cuda_flatten_thread_zx_loop = cuda_flatten_thread_loop<named_dim::z, named_dim::x>;
-using cuda_flatten_thread_zy_loop = cuda_flatten_thread_loop<named_dim::z, named_dim::y>;
-
-using cuda_flatten_thread_xyz_loop = cuda_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_thread_xzy_loop = cuda_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_thread_yxz_loop = cuda_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_thread_yzx_loop = cuda_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_thread_zxy_loop = cuda_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_thread_zyx_loop = cuda_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_thread_xy_loop =
+    cuda_flatten_thread_loop<named_dim::x, named_dim::y>;
+using cuda_flatten_thread_xz_loop =
+    cuda_flatten_thread_loop<named_dim::x, named_dim::z>;
+using cuda_flatten_thread_yx_loop =
+    cuda_flatten_thread_loop<named_dim::y, named_dim::x>;
+using cuda_flatten_thread_yz_loop =
+    cuda_flatten_thread_loop<named_dim::y, named_dim::z>;
+using cuda_flatten_thread_zx_loop =
+    cuda_flatten_thread_loop<named_dim::z, named_dim::x>;
+using cuda_flatten_thread_zy_loop =
+    cuda_flatten_thread_loop<named_dim::z, named_dim::y>;
+
+using cuda_flatten_thread_xyz_loop =
+    cuda_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_thread_xzy_loop =
+    cuda_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_thread_yxz_loop =
+    cuda_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_thread_yzx_loop =
+    cuda_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_thread_zxy_loop =
+    cuda_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_thread_zyx_loop =
+    cuda_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1733,7 +1959,7 @@ using cuda_flatten_thread_zyx_loop = cuda_flatten_thread_loop<named_dim::z, name
  * This is the lowest overhead mapping, but requires that there are enough
  * physical blocks to fit all of the direct map requests.
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_block_direct = cuda_indexer_direct<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1748,22 +1974,28 @@ using cuda_block_yz_direct = cuda_block_direct<named_dim::y, named_dim::z>;
 using cuda_block_zx_direct = cuda_block_direct<named_dim::z, named_dim::x>;
 using cuda_block_zy_direct = cuda_block_direct<named_dim::z, named_dim::y>;
 
-using cuda_block_xyz_direct = cuda_block_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_block_xzy_direct = cuda_block_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_block_yxz_direct = cuda_block_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_block_yzx_direct = cuda_block_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_block_zxy_direct = cuda_block_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_block_zyx_direct = cuda_block_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_block_xyz_direct =
+    cuda_block_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_block_xzy_direct =
+    cuda_block_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_block_yxz_direct =
+    cuda_block_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_block_yzx_direct =
+    cuda_block_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_block_zxy_direct =
+    cuda_block_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_block_zyx_direct =
+    cuda_block_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to CUDA blocks.
  * Uses grid-stride looping to exceed the maximum number of blocks
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_block_loop = cuda_indexer_loop<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_block_syncable_loop = cuda_indexer_syncable_loop<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1778,12 +2010,18 @@ using cuda_block_yz_loop = cuda_block_loop<named_dim::y, named_dim::z>;
 using cuda_block_zx_loop = cuda_block_loop<named_dim::z, named_dim::x>;
 using cuda_block_zy_loop = cuda_block_loop<named_dim::z, named_dim::y>;
 
-using cuda_block_xyz_loop = cuda_block_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_block_xzy_loop = cuda_block_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_block_yxz_loop = cuda_block_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_block_yzx_loop = cuda_block_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_block_zxy_loop = cuda_block_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_block_zyx_loop = cuda_block_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_block_xyz_loop =
+    cuda_block_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_block_xzy_loop =
+    cuda_block_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_block_yxz_loop =
+    cuda_block_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_block_yzx_loop =
+    cuda_block_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_block_zxy_loop =
+    cuda_block_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_block_zyx_loop =
+    cuda_block_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA blocks.
@@ -1791,7 +2029,7 @@ using cuda_block_zyx_loop = cuda_block_loop<named_dim::z, named_dim::y, named_di
  * physical blocks to fit all of the direct map requests.
  * Reshapes multiple physical blocks into a 1D iteration space
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_flatten_block_direct = cuda_flatten_indexer_direct<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1799,26 +2037,38 @@ using cuda_flatten_block_x_direct = cuda_flatten_block_direct<named_dim::x>;
 using cuda_flatten_block_y_direct = cuda_flatten_block_direct<named_dim::y>;
 using cuda_flatten_block_z_direct = cuda_flatten_block_direct<named_dim::z>;
 
-using cuda_flatten_block_xy_direct = cuda_flatten_block_direct<named_dim::x, named_dim::y>;
-using cuda_flatten_block_xz_direct = cuda_flatten_block_direct<named_dim::x, named_dim::z>;
-using cuda_flatten_block_yx_direct = cuda_flatten_block_direct<named_dim::y, named_dim::x>;
-using cuda_flatten_block_yz_direct = cuda_flatten_block_direct<named_dim::y, named_dim::z>;
-using cuda_flatten_block_zx_direct = cuda_flatten_block_direct<named_dim::z, named_dim::x>;
-using cuda_flatten_block_zy_direct = cuda_flatten_block_direct<named_dim::z, named_dim::y>;
-
-using cuda_flatten_block_xyz_direct = cuda_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_block_xzy_direct = cuda_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_block_yxz_direct = cuda_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_block_yzx_direct = cuda_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_block_zxy_direct = cuda_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_block_zyx_direct = cuda_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_block_xy_direct =
+    cuda_flatten_block_direct<named_dim::x, named_dim::y>;
+using cuda_flatten_block_xz_direct =
+    cuda_flatten_block_direct<named_dim::x, named_dim::z>;
+using cuda_flatten_block_yx_direct =
+    cuda_flatten_block_direct<named_dim::y, named_dim::x>;
+using cuda_flatten_block_yz_direct =
+    cuda_flatten_block_direct<named_dim::y, named_dim::z>;
+using cuda_flatten_block_zx_direct =
+    cuda_flatten_block_direct<named_dim::z, named_dim::x>;
+using cuda_flatten_block_zy_direct =
+    cuda_flatten_block_direct<named_dim::z, named_dim::y>;
+
+using cuda_flatten_block_xyz_direct =
+    cuda_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_block_xzy_direct =
+    cuda_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_block_yxz_direct =
+    cuda_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_block_yzx_direct =
+    cuda_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_block_zxy_direct =
+    cuda_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_block_zyx_direct =
+    cuda_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA blocks.
  * Reshapes multiple physical blocks into a 1D iteration space
  * Uses block-stride looping to exceed the maximum number of physical blocks
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using cuda_flatten_block_loop = cuda_flatten_indexer_loop<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1826,19 +2076,31 @@ using cuda_flatten_block_x_loop = cuda_flatten_block_loop<named_dim::x>;
 using cuda_flatten_block_y_loop = cuda_flatten_block_loop<named_dim::y>;
 using cuda_flatten_block_z_loop = cuda_flatten_block_loop<named_dim::z>;
 
-using cuda_flatten_block_xy_loop = cuda_flatten_block_loop<named_dim::x, named_dim::y>;
-using cuda_flatten_block_xz_loop = cuda_flatten_block_loop<named_dim::x, named_dim::z>;
-using cuda_flatten_block_yx_loop = cuda_flatten_block_loop<named_dim::y, named_dim::x>;
-using cuda_flatten_block_yz_loop = cuda_flatten_block_loop<named_dim::y, named_dim::z>;
-using cuda_flatten_block_zx_loop = cuda_flatten_block_loop<named_dim::z, named_dim::x>;
-using cuda_flatten_block_zy_loop = cuda_flatten_block_loop<named_dim::z, named_dim::y>;
-
-using cuda_flatten_block_xyz_loop = cuda_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_block_xzy_loop = cuda_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_block_yxz_loop = cuda_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_block_yzx_loop = cuda_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_block_zxy_loop = cuda_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_block_zyx_loop = cuda_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_block_xy_loop =
+    cuda_flatten_block_loop<named_dim::x, named_dim::y>;
+using cuda_flatten_block_xz_loop =
+    cuda_flatten_block_loop<named_dim::x, named_dim::z>;
+using cuda_flatten_block_yx_loop =
+    cuda_flatten_block_loop<named_dim::y, named_dim::x>;
+using cuda_flatten_block_yz_loop =
+    cuda_flatten_block_loop<named_dim::y, named_dim::z>;
+using cuda_flatten_block_zx_loop =
+    cuda_flatten_block_loop<named_dim::z, named_dim::x>;
+using cuda_flatten_block_zy_loop =
+    cuda_flatten_block_loop<named_dim::z, named_dim::y>;
+
+using cuda_flatten_block_xyz_loop =
+    cuda_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_block_xzy_loop =
+    cuda_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_block_yxz_loop =
+    cuda_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_block_yzx_loop =
+    cuda_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_block_zxy_loop =
+    cuda_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_block_zyx_loop =
+    cuda_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1846,9 +2108,11 @@ using cuda_flatten_block_zyx_loop = cuda_flatten_block_loop<named_dim::z, named_
  * This is the lowest overhead mapping, but requires that there are enough
  * physical threads to fit all of the direct map requests.
  */
-template < named_dim ... dims >
-using cuda_global_direct = cuda_indexer_direct<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template <named_dim... dims>
+using cuda_global_direct =
+    cuda_indexer_direct<cuda::IndexGlobal<dims,
+                                          named_usage::unspecified,
+                                          named_usage::unspecified>...>;
 
 using cuda_global_x_direct = cuda_global_direct<named_dim::x>;
 using cuda_global_y_direct = cuda_global_direct<named_dim::y>;
@@ -1861,24 +2125,34 @@ using cuda_global_yz_direct = cuda_global_direct<named_dim::y, named_dim::z>;
 using cuda_global_zx_direct = cuda_global_direct<named_dim::z, named_dim::x>;
 using cuda_global_zy_direct = cuda_global_direct<named_dim::z, named_dim::y>;
 
-using cuda_global_xyz_direct = cuda_global_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_global_xzy_direct = cuda_global_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_global_yxz_direct = cuda_global_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_global_yzx_direct = cuda_global_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_global_zxy_direct = cuda_global_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_global_zyx_direct = cuda_global_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_global_xyz_direct =
+    cuda_global_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_global_xzy_direct =
+    cuda_global_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_global_yxz_direct =
+    cuda_global_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_global_yzx_direct =
+    cuda_global_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_global_zxy_direct =
+    cuda_global_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_global_zyx_direct =
+    cuda_global_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to CUDA global threads.
  * Uses grid-stride looping to exceed the maximum number of global threads
  */
-template < named_dim ... dims >
-using cuda_global_loop = cuda_indexer_loop<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
-
-template < named_dim ... dims >
-using cuda_global_syncable_loop = cuda_indexer_syncable_loop<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template <named_dim... dims>
+using cuda_global_loop =
+    cuda_indexer_loop<cuda::IndexGlobal<dims,
+                                        named_usage::unspecified,
+                                        named_usage::unspecified>...>;
+
+template <named_dim... dims>
+using cuda_global_syncable_loop =
+    cuda_indexer_syncable_loop<cuda::IndexGlobal<dims,
+                                                 named_usage::unspecified,
+                                                 named_usage::unspecified>...>;
 
 using cuda_global_x_loop = cuda_global_loop<named_dim::x>;
 using cuda_global_y_loop = cuda_global_loop<named_dim::y>;
@@ -1891,12 +2165,18 @@ using cuda_global_yz_loop = cuda_global_loop<named_dim::y, named_dim::z>;
 using cuda_global_zx_loop = cuda_global_loop<named_dim::z, named_dim::x>;
 using cuda_global_zy_loop = cuda_global_loop<named_dim::z, named_dim::y>;
 
-using cuda_global_xyz_loop = cuda_global_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_global_xzy_loop = cuda_global_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_global_yxz_loop = cuda_global_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_global_yzx_loop = cuda_global_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_global_zxy_loop = cuda_global_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_global_zyx_loop = cuda_global_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_global_xyz_loop =
+    cuda_global_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_global_xzy_loop =
+    cuda_global_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_global_yxz_loop =
+    cuda_global_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_global_yzx_loop =
+    cuda_global_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_global_zxy_loop =
+    cuda_global_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_global_zyx_loop =
+    cuda_global_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA global threads.
@@ -1904,54 +2184,83 @@ using cuda_global_zyx_loop = cuda_global_loop<named_dim::z, named_dim::y, named_
  * physical global threads to fit all of the direct map requests.
  * Reshapes multiple physical global threads into a 1D iteration space
  */
-template < named_dim ... dims >
-using cuda_flatten_global_direct = cuda_flatten_indexer_direct<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template <named_dim... dims>
+using cuda_flatten_global_direct =
+    cuda_flatten_indexer_direct<cuda::IndexGlobal<dims,
+                                                  named_usage::unspecified,
+                                                  named_usage::unspecified>...>;
 
 using cuda_flatten_global_x_direct = cuda_flatten_global_direct<named_dim::x>;
 using cuda_flatten_global_y_direct = cuda_flatten_global_direct<named_dim::y>;
 using cuda_flatten_global_z_direct = cuda_flatten_global_direct<named_dim::z>;
 
-using cuda_flatten_global_xy_direct = cuda_flatten_global_direct<named_dim::x, named_dim::y>;
-using cuda_flatten_global_xz_direct = cuda_flatten_global_direct<named_dim::x, named_dim::z>;
-using cuda_flatten_global_yx_direct = cuda_flatten_global_direct<named_dim::y, named_dim::x>;
-using cuda_flatten_global_yz_direct = cuda_flatten_global_direct<named_dim::y, named_dim::z>;
-using cuda_flatten_global_zx_direct = cuda_flatten_global_direct<named_dim::z, named_dim::x>;
-using cuda_flatten_global_zy_direct = cuda_flatten_global_direct<named_dim::z, named_dim::y>;
-
-using cuda_flatten_global_xyz_direct = cuda_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_global_xzy_direct = cuda_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_global_yxz_direct = cuda_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_global_yzx_direct = cuda_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_global_zxy_direct = cuda_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_global_zyx_direct = cuda_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_global_xy_direct =
+    cuda_flatten_global_direct<named_dim::x, named_dim::y>;
+using cuda_flatten_global_xz_direct =
+    cuda_flatten_global_direct<named_dim::x, named_dim::z>;
+using cuda_flatten_global_yx_direct =
+    cuda_flatten_global_direct<named_dim::y, named_dim::x>;
+using cuda_flatten_global_yz_direct =
+    cuda_flatten_global_direct<named_dim::y, named_dim::z>;
+using cuda_flatten_global_zx_direct =
+    cuda_flatten_global_direct<named_dim::z, named_dim::x>;
+using cuda_flatten_global_zy_direct =
+    cuda_flatten_global_direct<named_dim::z, named_dim::y>;
+
+using cuda_flatten_global_xyz_direct =
+    cuda_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_global_xzy_direct =
+    cuda_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_global_yxz_direct =
+    cuda_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_global_yzx_direct =
+    cuda_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_global_zxy_direct =
+    cuda_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_global_zyx_direct =
+    cuda_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA global threads.
  * Reshapes multiple physical global threads into a 1D iteration space
- * Uses global thread-stride looping to exceed the maximum number of physical global threads
+ * Uses global thread-stride looping to exceed the maximum number of physical
+ * global threads
  */
-template < named_dim ... dims >
-using cuda_flatten_global_loop = cuda_flatten_indexer_loop<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template <named_dim... dims>
+using cuda_flatten_global_loop =
+    cuda_flatten_indexer_loop<cuda::IndexGlobal<dims,
+                                                named_usage::unspecified,
+                                                named_usage::unspecified>...>;
 
 using cuda_flatten_global_x_loop = cuda_flatten_global_loop<named_dim::x>;
 using cuda_flatten_global_y_loop = cuda_flatten_global_loop<named_dim::y>;
 using cuda_flatten_global_z_loop = cuda_flatten_global_loop<named_dim::z>;
 
-using cuda_flatten_global_xy_loop = cuda_flatten_global_loop<named_dim::x, named_dim::y>;
-using cuda_flatten_global_xz_loop = cuda_flatten_global_loop<named_dim::x, named_dim::z>;
-using cuda_flatten_global_yx_loop = cuda_flatten_global_loop<named_dim::y, named_dim::x>;
-using cuda_flatten_global_yz_loop = cuda_flatten_global_loop<named_dim::y, named_dim::z>;
-using cuda_flatten_global_zx_loop = cuda_flatten_global_loop<named_dim::z, named_dim::x>;
-using cuda_flatten_global_zy_loop = cuda_flatten_global_loop<named_dim::z, named_dim::y>;
-
-using cuda_flatten_global_xyz_loop = cuda_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_global_xzy_loop = cuda_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_global_yxz_loop = cuda_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_global_yzx_loop = cuda_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_global_zxy_loop = cuda_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_global_zyx_loop = cuda_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_global_xy_loop =
+    cuda_flatten_global_loop<named_dim::x, named_dim::y>;
+using cuda_flatten_global_xz_loop =
+    cuda_flatten_global_loop<named_dim::x, named_dim::z>;
+using cuda_flatten_global_yx_loop =
+    cuda_flatten_global_loop<named_dim::y, named_dim::x>;
+using cuda_flatten_global_yz_loop =
+    cuda_flatten_global_loop<named_dim::y, named_dim::z>;
+using cuda_flatten_global_zx_loop =
+    cuda_flatten_global_loop<named_dim::z, named_dim::x>;
+using cuda_flatten_global_zy_loop =
+    cuda_flatten_global_loop<named_dim::z, named_dim::y>;
+
+using cuda_flatten_global_xyz_loop =
+    cuda_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_global_xzy_loop =
+    cuda_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_global_yxz_loop =
+    cuda_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_global_yzx_loop =
+    cuda_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_global_zxy_loop =
+    cuda_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_global_zyx_loop =
+    cuda_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1959,271 +2268,481 @@ using cuda_flatten_global_zyx_loop = cuda_flatten_global_loop<named_dim::z, name
  * This is the lowest overhead mapping, but requires that there are enough
  * physical threads to fit all of the direct map requests.
  */
-template < int X_BLOCK_SIZE >
-using cuda_thread_size_x_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using cuda_thread_size_y_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using cuda_thread_size_z_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_xy_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_xz_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_yx_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_yz_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_zx_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_zy_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_xyz_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_xzy_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_yxz_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_yzx_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_zxy_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_zyx_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using cuda_block_size_x_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using cuda_block_size_y_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using cuda_block_size_z_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_xy_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_xz_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_yx_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_yz_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_zx_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_zy_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_xyz_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_xzy_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_yxz_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_yzx_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_zxy_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_zyx_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_x_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_y_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_z_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xy_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xz_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yx_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yz_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zx_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zy_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xyz_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xzy_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yxz_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yzx_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zxy_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zyx_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int X_BLOCK_SIZE>
+using cuda_thread_size_x_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE>
+using cuda_thread_size_y_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE>
+using cuda_thread_size_z_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_xy_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_xz_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_yx_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_yz_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_zx_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_zy_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_xyz_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_xzy_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_yxz_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_yzx_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_zxy_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_zyx_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>>;
+
+
+template <int X_GRID_SIZE>
+using cuda_block_size_x_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE>
+using cuda_block_size_y_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE>
+using cuda_block_size_z_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_xy_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_xz_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_yx_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_yz_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_zx_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_zy_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_xyz_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                        cuda::block_y<Y_GRID_SIZE>,
+                        cuda::block_z<Z_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_xzy_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                        cuda::block_z<Z_GRID_SIZE>,
+                        cuda::block_y<Y_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_yxz_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                        cuda::block_x<X_GRID_SIZE>,
+                        cuda::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_yzx_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                        cuda::block_z<Z_GRID_SIZE>,
+                        cuda::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_zxy_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                        cuda::block_x<X_GRID_SIZE>,
+                        cuda::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_zyx_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                        cuda::block_y<Y_GRID_SIZE>,
+                        cuda::block_x<X_GRID_SIZE>>;
+
+
+template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_x_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_y_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_z_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xy_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xz_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yx_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yz_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zx_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zy_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xyz_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xzy_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yxz_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yzx_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zxy_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zyx_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*!
  * Maps segment indices to CUDA global threads.
  * Uses grid-stride looping to exceed the maximum number of global threads
  */
-template < int X_BLOCK_SIZE >
+template <int X_BLOCK_SIZE>
 using cuda_thread_size_x_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
+template <int Y_BLOCK_SIZE>
 using cuda_thread_size_y_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
+template <int Z_BLOCK_SIZE>
 using cuda_thread_size_z_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>>;
 
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_xy_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_xz_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_yx_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_yz_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_zx_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_zy_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_xyz_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_xzy_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_yxz_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_yzx_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_zxy_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_zyx_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_xy_loop =
+    cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_xz_loop =
+    cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_yx_loop =
+    cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_yz_loop =
+    cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_zx_loop =
+    cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_zy_loop =
+    cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_xyz_loop =
+    cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_xzy_loop =
+    cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_yxz_loop =
+    cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_yzx_loop =
+    cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_zxy_loop =
+    cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_zyx_loop =
+    cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>>;
+
+
+template <int X_GRID_SIZE>
 using cuda_block_size_x_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
+template <int Y_GRID_SIZE>
 using cuda_block_size_y_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
+template <int Z_GRID_SIZE>
 using cuda_block_size_z_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>>;
 
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_xy_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_xz_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_yx_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_yz_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_zx_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_zy_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_xyz_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_xzy_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_yxz_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_yzx_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_zxy_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_zyx_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_x_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_y_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_z_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xy_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xz_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yx_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yz_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zx_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zy_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xyz_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xzy_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yxz_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yzx_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zxy_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zyx_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_xy_loop =
+    cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_xz_loop =
+    cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_yx_loop =
+    cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_yz_loop =
+    cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_zx_loop =
+    cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_zy_loop =
+    cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_xyz_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                                                   cuda::block_y<Y_GRID_SIZE>,
+                                                   cuda::block_z<Z_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_xzy_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                                                   cuda::block_z<Z_GRID_SIZE>,
+                                                   cuda::block_y<Y_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_yxz_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                                                   cuda::block_x<X_GRID_SIZE>,
+                                                   cuda::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_yzx_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                                                   cuda::block_z<Z_GRID_SIZE>,
+                                                   cuda::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_zxy_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                                                   cuda::block_x<X_GRID_SIZE>,
+                                                   cuda::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_zyx_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                                                   cuda::block_y<Y_GRID_SIZE>,
+                                                   cuda::block_x<X_GRID_SIZE>>;
+
+
+template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_x_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_y_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_z_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xy_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xz_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yx_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yz_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zx_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zy_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xyz_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xzy_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yxz_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yzx_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zxy_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zyx_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*
  * Maps segment indices to flattened CUDA global threads.
@@ -2231,272 +2750,507 @@ using cuda_global_size_zyx_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE,
  * physical global threads to fit all of the direct map requests.
  * Reshapes multiple physical global threads into a 1D iteration space
  */
-template < int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_x_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_y_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_z_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_xy_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_xz_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_yx_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_yz_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_zx_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_zy_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_xyz_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_xzy_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_yxz_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_yzx_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_zxy_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_zyx_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using cuda_flatten_block_size_x_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using cuda_flatten_block_size_y_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using cuda_flatten_block_size_z_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_xy_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_xz_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_yx_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_yz_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_zx_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_zy_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_xyz_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_xzy_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_yxz_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_yzx_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_zxy_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_zyx_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_x_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_y_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_z_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xy_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xz_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yx_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yz_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zx_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zy_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xyz_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xzy_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yxz_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yzx_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zxy_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zyx_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_x_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_y_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_z_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_xy_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_xz_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_yx_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_yz_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_zx_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_zy_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_xyz_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_xzy_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_yxz_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_yzx_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_zxy_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_zyx_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>>;
+
+
+template <int X_GRID_SIZE>
+using cuda_flatten_block_size_x_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE>
+using cuda_flatten_block_size_y_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE>
+using cuda_flatten_block_size_z_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_xy_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_xz_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_yx_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_yz_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_zx_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_zy_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_xyz_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_xzy_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_yxz_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_yzx_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_zxy_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_zyx_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>>;
+
+
+template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_x_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_y_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_z_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xy_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xz_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yx_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yz_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zx_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zy_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xyz_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xzy_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yxz_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yzx_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zxy_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zyx_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*
  * Maps segment indices to flattened CUDA global threads.
  * Reshapes multiple physical global threads into a 1D iteration space
- * Uses global thread-stride looping to exceed the maximum number of physical global threads
+ * Uses global thread-stride looping to exceed the maximum number of physical
+ * global threads
  */
-template < int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_x_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_y_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_z_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_xy_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_xz_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_yx_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_yz_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_zx_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_zy_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_xyz_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_xzy_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_yxz_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_yzx_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_zxy_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_zyx_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using cuda_flatten_block_size_x_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using cuda_flatten_block_size_y_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using cuda_flatten_block_size_z_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_xy_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_xz_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_yx_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_yz_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_zx_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_zy_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_xyz_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_xzy_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_yxz_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_yzx_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_zxy_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_zyx_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_x_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_y_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_z_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xy_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xz_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yx_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yz_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zx_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zy_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xyz_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xzy_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yxz_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yzx_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zxy_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zyx_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_x_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_y_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_z_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_xy_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_xz_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_yx_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_yz_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_zx_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_zy_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_xyz_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_xzy_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_yxz_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_yzx_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_zxy_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_zyx_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>>;
+
+
+template <int X_GRID_SIZE>
+using cuda_flatten_block_size_x_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE>
+using cuda_flatten_block_size_y_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE>
+using cuda_flatten_block_size_z_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_xy_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_xz_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_yx_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_yz_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_zx_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_zy_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_xyz_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_xzy_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_yxz_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_yzx_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_zxy_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_zyx_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>>;
+
+
+template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_x_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_y_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_z_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xy_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xz_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yx_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yz_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zx_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zy_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xyz_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xzy_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yxz_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yzx_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zxy_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zyx_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 
 /*
diff --git a/include/RAJA/policy/cuda/raja_cudaerrchk.hpp b/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
index 409ec16818..1aa5e84207 100644
--- a/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
+++ b/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
@@ -43,18 +43,18 @@ namespace RAJA
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
-#define cudaErrchk(ans)                            \
-  {                                                \
-    ::RAJA::cudaAssert((ans), __FILE__, __LINE__); \
+#define cudaErrchk(ans)                                                        \
+  {                                                                            \
+    ::RAJA::cudaAssert((ans), __FILE__, __LINE__);                             \
   }
 
-inline void cudaAssert(cudaError_t code,
-                       const char *file,
-                       int line,
-                       bool abort = true)
+inline void
+cudaAssert(cudaError_t code, const char* file, int line, bool abort = true)
 {
-  if (code != cudaSuccess) {
-    if (abort) {
+  if (code != cudaSuccess)
+  {
+    if (abort)
+    {
       std::string msg;
       msg += "CUDAassert: ";
       msg += cudaGetErrorString(code);
@@ -63,9 +63,11 @@ inline void cudaAssert(cudaError_t code,
       msg += ":";
       msg += std::to_string(line);
       throw std::runtime_error(msg);
-    } else {
-      fprintf(stderr, "CUDAassert: %s %s %d\n",
-              cudaGetErrorString(code), file, line);
+    }
+    else
+    {
+      fprintf(stderr, "CUDAassert: %s %s %d\n", cudaGetErrorString(code), file,
+              line);
     }
   }
 }
diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp
index 2b13417531..ae305521b6 100644
--- a/include/RAJA/policy/cuda/reduce.hpp
+++ b/include/RAJA/policy/cuda/reduce.hpp
@@ -44,9 +44,9 @@
 #include "RAJA/policy/cuda/intrinsics.hpp"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
 #include "RAJA/policy/cuda/policy.hpp"
@@ -66,47 +66,53 @@ template <typename Combiner>
 struct atomic;
 
 template <typename T>
-struct atomic<sum<T>> {
+struct atomic<sum<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicAdd(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct atomic<min<T>> {
+struct atomic<min<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicMin(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct atomic<max<T>> {
+struct atomic<max<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicMax(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct atomic<and_bit<T>> {
+struct atomic<and_bit<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicAnd(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct atomic<or_bit<T>> {
+struct atomic<or_bit<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicOr(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct cuda_atomic_available {
+struct cuda_atomic_available
+{
   static constexpr const bool value =
       (std::is_integral<T>::value && (4 == sizeof(T) || 8 == sizeof(T))) ||
       std::is_same<T, float>::value || std::is_same<T, double>::value;
@@ -124,15 +130,19 @@ namespace impl
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
-          typename T, typename TempIterator>
+template <typename Combiner,
+          typename Accessor,
+          int replication,
+          int atomic_stride,
+          typename T,
+          typename TempIterator>
 RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
-                                        T identity,
-                                        TempIterator in_device_mem,
-                                        unsigned int* device_count)
+                                                   T identity,
+                                                   TempIterator in_device_mem,
+                                                   unsigned int* device_count)
 {
-  typename TempIterator::template rebind_accessor<Accessor> device_mem(in_device_mem);
+  typename TempIterator::template rebind_accessor<Accessor> device_mem(
+      in_device_mem);
 
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -143,20 +153,22 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
   int replicationId = blockId % replication;
-  int slotId = blockId / replication;
+  int slotId        = blockId / replication;
 
-  int maxNumSlots = (numBlocks + replication - 1) / replication;
+  int maxNumSlots       = (numBlocks + replication - 1) / replication;
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
   int atomicOffset = replicationId * atomic_stride;
-  int beginSlots = replicationId * maxNumSlots;
-  int blockSlot = beginSlots + slotId;
+  int beginSlots   = replicationId * maxNumSlots;
+  int blockSlot    = beginSlots + slotId;
 
   T temp = block_reduce<Combiner>(val, identity);
 
-  if (numSlots <= 1u) {
-    if (threadId == 0) {
+  if (numSlots <= 1u)
+  {
+    if (threadId == 0)
+    {
       val = temp;
     }
     return (threadId == 0) ? replicationId : replication;
@@ -164,33 +176,36 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
 
   // one thread per block writes to device_mem
   bool isLastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     device_mem.set(blockSlot, temp);
     // ensure write visible to all threadblocks
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots-1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1));
-    isLastBlock = (old_count == (numSlots-1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
+    isLastBlock = (old_count == (numSlots - 1));
   }
 
   // returns non-zero value if any thread passes in a non-zero value
   isLastBlock = __syncthreads_or(isLastBlock);
 
   // last block accumulates values from device_mem
-  if (isLastBlock) {
+  if (isLastBlock)
+  {
     temp = identity;
     Accessor::fence_acquire();
 
-    for (unsigned int i = threadId;
-                      i < numSlots;
-                      i += numThreads) {
-      Combiner{}(temp, device_mem.get(beginSlots + i));
+    for (unsigned int i = threadId; i < numSlots; i += numThreads)
+    {
+      Combiner {}(temp, device_mem.get(beginSlots + i));
     }
 
     temp = block_reduce<Combiner>(temp, identity);
 
     // one thread returns value
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       val = temp;
     }
   }
@@ -198,72 +213,92 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   return (isLastBlock && threadId == 0) ? replicationId : replication;
 }
 
-namespace expt {
+namespace expt
+{
 
 template <typename ThreadIterationGetter, typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 {
   const int numThreads = ThreadIterationGetter::size();
-  const int threadId = ThreadIterationGetter::index();
+  const int threadId   = ThreadIterationGetter::index();
 
-  const int warpId = threadId % RAJA::policy::cuda::device_constants.WARP_SIZE;
+  const int warpId  = threadId % RAJA::policy::cuda::device_constants.WARP_SIZE;
   const int warpNum = threadId / RAJA::policy::cuda::device_constants.WARP_SIZE;
 
   T temp = val;
 
-  if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0) {
+  if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
-      temp = Combiner{}(temp, rhs);
+      temp  = Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = RAJA::cuda::impl::shfl_sync(temp, srcLane);
+      T rhs       = RAJA::cuda::impl::shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        temp = Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        temp = Combiner {}(temp, rhs);
       }
     }
   }
 
-  static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <= RAJA::policy::cuda::device_constants.WARP_SIZE,
-               "Max Warps must be less than or equal to Warp Size for this algorithm to work");
+  static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
+                "Max Warps must be less than or equal to Warp Size for this "
+                "algorithm to work");
 
   // reduce per warp values
-  if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE) {
+  if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE)
+  {
 
     // Need to separate declaration and initialization for clang-cuda
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char
+        tmpsd[sizeof(RAJA::detail::SoAArray<
+                     T, RAJA::policy::cuda::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
-    RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS> * sd = reinterpret_cast<RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS> *>(tmpsd);
+    RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS>*
+        sd = reinterpret_cast<RAJA::detail::SoAArray<
+            T, RAJA::policy::cuda::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
-    if (warpId == 0) {
+    if (warpId == 0)
+    {
       sd->set(warpNum, temp);
     }
 
     __syncthreads();
 
-    if (warpNum == 0) {
+    if (warpNum == 0)
+    {
 
       // read per warp values
-      if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads)
+      {
         temp = sd->get(warpId);
-      } else {
+      }
+      else
+      {
         temp = identity;
       }
 
-      for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS;
+           i *= 2)
+      {
         T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
-        temp = Combiner{}(temp, rhs);
+        temp  = Combiner {}(temp, rhs);
       }
     }
 
@@ -275,66 +310,77 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 
 
 template <typename GlobalIterationGetter, typename OP, typename T>
-RAJA_DEVICE RAJA_INLINE void grid_reduce( T * device_target,
-                                          T val,
-                                          RAJA::detail::SoAPtr<T,RAJA::cuda::device_mempool_type> device_mem,
-                                          unsigned int* device_count)
+RAJA_DEVICE RAJA_INLINE void
+grid_reduce(T* device_target,
+            T val,
+            RAJA::detail::SoAPtr<T, RAJA::cuda::device_mempool_type> device_mem,
+            unsigned int* device_count)
 {
-  using BlockIterationGetter = typename get_index_block<GlobalIterationGetter>::type;
-  using ThreadIterationGetter = typename get_index_thread<GlobalIterationGetter>::type;
+  using BlockIterationGetter =
+      typename get_index_block<GlobalIterationGetter>::type;
+  using ThreadIterationGetter =
+      typename get_index_thread<GlobalIterationGetter>::type;
 
-  const int numBlocks = BlockIterationGetter::size();
-  const int numThreads = ThreadIterationGetter::size();
+  const int numBlocks            = BlockIterationGetter::size();
+  const int numThreads           = ThreadIterationGetter::size();
   const unsigned int wrap_around = numBlocks - 1;
 
-  const int blockId = BlockIterationGetter::index();
+  const int blockId  = BlockIterationGetter::index();
   const int threadId = ThreadIterationGetter::index();
 
   T temp = block_reduce<ThreadIterationGetter, OP>(val, OP::identity());
 
   // one thread per block writes to device_mem
   bool lastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     device_mem.set(blockId, temp);
     // ensure write visible to all threadblocks
     __threadfence();
     // increment counter, (wraps back to zero if old count == wrap_around)
     unsigned int old_count = ::atomicInc(device_count, wrap_around);
-    lastBlock = (old_count == wrap_around);
+    lastBlock              = (old_count == wrap_around);
   }
 
   // returns non-zero value if any thread passes in a non-zero value
   lastBlock = __syncthreads_or(lastBlock);
 
   // last block accumulates values from device_mem
-  if (lastBlock) {
+  if (lastBlock)
+  {
     temp = OP::identity();
     __threadfence();
 
-    for (int i = threadId; i < numBlocks; i += numThreads) {
-      temp = OP{}(temp, device_mem.get(i));
+    for (int i = threadId; i < numBlocks; i += numThreads)
+    {
+      temp = OP {}(temp, device_mem.get(i));
     }
 
     temp = block_reduce<ThreadIterationGetter, OP>(temp, OP::identity());
 
     // one thread returns value
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       *device_target = temp;
     }
   }
 }
 
-} //  namespace expt
+}  //  namespace expt
 
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride, typename T>
-RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
-                                               T identity,
-                                               T* device_mem,
-                                               unsigned int* device_count)
+template <typename Combiner,
+          typename Accessor,
+          int replication,
+          int atomic_stride,
+          typename T>
+RAJA_DEVICE RAJA_INLINE int
+grid_reduce_atomic_device_init(T& val,
+                               T identity,
+                               T* device_mem,
+                               unsigned int* device_count)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -343,24 +389,28 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
                 (gridDim.x * gridDim.y) * blockIdx.z;
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset  = replicationId * atomic_stride;
 
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
-  if (numSlots <= 1u) {
+  if (numSlots <= 1u)
+  {
     T temp = block_reduce<Combiner>(val, identity);
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       val = temp;
     }
     return (threadId == 0) ? replicationId : replication;
   }
 
   // the first block of each replication initializes device_mem
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u);
-    if (old_val == 0u) {
+    if (old_val == 0u)
+    {
       Accessor::set(device_mem, atomicOffset, identity);
       Accessor::fence_release();
       ::atomicAdd(&device_count[atomicOffset], 1u);
@@ -371,19 +421,22 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 
   // one thread per block performs an atomic on device_mem
   bool isLastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     // wait for device_mem to be initialized
     while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u)
       ;
     Accessor::fence_acquire();
-    RAJA::reduce::cuda::atomic<Combiner>{}(device_mem[atomicOffset], temp);
+    RAJA::reduce::cuda::atomic<Combiner> {}(device_mem[atomicOffset], temp);
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots+1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1));
-    isLastBlock = (old_count == (numSlots+1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
+    isLastBlock = (old_count == (numSlots + 1));
 
     // the last block for each replication gets the value from device_mem
-    if (isLastBlock) {
+    if (isLastBlock)
+    {
       Accessor::fence_acquire();
       val = Accessor::get(device_mem, atomicOffset);
     }
@@ -394,9 +447,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 
 //! reduce values in block into thread 0 and atomically combines into device_mem
 template <typename Combiner, int replication, int atomic_stride, typename T>
-RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
-                                                            T identity,
-                                                            T* device_mem)
+RAJA_DEVICE RAJA_INLINE void
+grid_reduce_atomic_host_init(T& val, T identity, T* device_mem)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -404,14 +456,15 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                 (gridDim.x * gridDim.y) * blockIdx.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset  = replicationId * atomic_stride;
 
   T temp = block_reduce<Combiner>(val, identity);
 
   // one thread per block performs an atomic on device_mem
-  if (threadId == 0 && temp != identity) {
-    RAJA::reduce::cuda::atomic<Combiner>{}(device_mem[atomicOffset], temp);
+  if (threadId == 0 && temp != identity)
+  {
+    RAJA::reduce::cuda::atomic<Combiner> {}(device_mem[atomicOffset], temp);
   }
 }
 
@@ -424,12 +477,14 @@ class PinnedTally
 {
 public:
   //! Object put in Pinned memory with value and pointer to next Node
-  struct Node {
+  struct Node
+  {
     Node* next;
     T values[num_slots];
   };
   //! Object per resource to keep track of pinned memory nodes
-  struct ResourceNode {
+  struct ResourceNode
+  {
     ResourceNode* next;
     ::RAJA::resources::Cuda res;
     Node* node_list;
@@ -482,14 +537,19 @@ class PinnedTally
 
     const ResourceNodeIterator& operator++()
     {
-      if (m_n->next) {
+      if (m_n->next)
+      {
         m_n = m_n->next;
-      } else if (m_rn->next) {
+      }
+      else if (m_rn->next)
+      {
         m_rn = m_rn->next;
-        m_n = m_rn->node_list;
-      } else {
+        m_n  = m_rn->node_list;
+      }
+      else
+      {
         m_rn = nullptr;
-        m_n = nullptr;
+        m_n  = nullptr;
       }
       return *this;
     }
@@ -501,7 +561,7 @@ class PinnedTally
       return ret;
     }
 
-    auto operator*() -> T(&)[num_slots] { return m_n->values; }
+    auto operator*() -> T (&)[num_slots] { return m_n->values; }
 
     bool operator==(const ResourceNodeIterator& rhs) const
     {
@@ -538,25 +598,27 @@ class PinnedTally
   ResourceNodeIterator end() { return {nullptr, nullptr}; }
 
   //! get new value for use in resource
-  auto new_value(::RAJA::resources::Cuda res) -> T(&)[num_slots]
+  auto new_value(::RAJA::resources::Cuda res) -> T (&)[num_slots]
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
     ResourceNode* rn = resource_list;
-    while (rn) {
+    while (rn)
+    {
       if (rn->res.get_stream() == res.get_stream()) break;
       rn = rn->next;
     }
-    if (!rn) {
-      rn = (ResourceNode*)malloc(sizeof(ResourceNode));
-      rn->next = resource_list;
-      rn->res = res;
+    if (!rn)
+    {
+      rn            = (ResourceNode*)malloc(sizeof(ResourceNode));
+      rn->next      = resource_list;
+      rn->res       = res;
       rn->node_list = nullptr;
       resource_list = rn;
     }
-    Node* n = mempool::getInstance().template malloc<Node>(1);
-    n->next = rn->node_list;
+    Node* n       = mempool::getInstance().template malloc<Node>(1);
+    n->next       = rn->node_list;
     rn->node_list = n;
     return n->values;
   }
@@ -565,7 +627,8 @@ class PinnedTally
   void synchronize_resources()
   {
     auto end = resourceEnd();
-    for (auto r = resourceBegin(); r != end; ++r) {
+    for (auto r = resourceBegin(); r != end; ++r)
+    {
       ::RAJA::cuda::synchronize(*r);
     }
   }
@@ -573,10 +636,12 @@ class PinnedTally
   //! all values used in all resources
   void free_list()
   {
-    while (resource_list) {
+    while (resource_list)
+    {
       ResourceNode* rn = resource_list;
-      while (rn->node_list) {
-        Node* n = rn->node_list;
+      while (rn->node_list)
+      {
+        Node* n       = rn->node_list;
         rn->node_list = n->next;
         mempool::getInstance().free(n);
       }
@@ -605,12 +670,15 @@ class PinnedTally
 
 //! Reduction data for Cuda Offload -- stores value, host pointer, and device
 //! pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
+template <typename Combiner,
+          typename Accessor,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
 struct ReduceLastBlock_Data
 {
   using tally_mempool_type = pinned_mempool_type;
-  using data_mempool_type = device_mempool_type;
+  using data_mempool_type  = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
 
   static constexpr size_t tally_slots = replication;
@@ -621,7 +689,7 @@ struct ReduceLastBlock_Data
   RAJA::detail::SoAPtr<T, data_mempool_type> device;
   bool owns_device_pointer;
 
-  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){}
+  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {}
 
   /*! \brief create from a default value and offload information
    *
@@ -629,31 +697,30 @@ struct ReduceLastBlock_Data
    */
 
   ReduceLastBlock_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        device_count{nullptr},
-        device{},
-        owns_device_pointer{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        device_count {nullptr},
+        device {},
+        owns_device_pointer {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceLastBlock_Data(const ReduceLastBlock_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        device_count{other.device_count},
-        device{other.device},
-        owns_device_pointer{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        device_count {other.device_count},
+        device {other.device},
+        owns_device_pointer {false}
+  {}
 
   ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -665,10 +732,12 @@ struct ReduceLastBlock_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_last_block<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
-    if (replicationId != replication) {
+    size_t replicationId =
+        impl::grid_reduce_last_block<Combiner, Accessor, replication,
+                                     atomic_stride>(temp, identity, device,
+                                                    device_count);
+    if (replicationId != replication)
+    {
       output[replicationId] = temp;
     }
   }
@@ -678,13 +747,15 @@ struct ReduceLastBlock_Data
   bool setupForDevice()
   {
     bool act = !device.allocated() && setupReducers();
-    if (act) {
+    if (act)
+    {
       cuda_dim_t gridDim = currentGridDim();
-      size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
+      size_t numBlocks   = gridDim.x * gridDim.y * gridDim.z;
       size_t maxNumSlots = (numBlocks + replication - 1) / replication;
-      device.allocate(maxNumSlots*replication);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+      device.allocate(maxNumSlots * replication);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       owns_device_pointer = true;
     }
     return act;
@@ -695,10 +766,11 @@ struct ReduceLastBlock_Data
   bool teardownForDevice()
   {
     bool act = owns_device_pointer;
-    if (act) {
+    if (act)
+    {
       device.deallocate();
       count_mempool_type::getInstance().free(device_count);
-      device_count = nullptr;
+      device_count        = nullptr;
       owns_device_pointer = false;
     }
     return act;
@@ -706,8 +778,10 @@ struct ReduceLastBlock_Data
 };
 
 //! Reduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename T,
-          size_t replication, size_t atomic_stride>
+template <typename Combiner,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
 struct ReduceAtomicHostInit_Data
 {
   using tally_mempool_type = device_pinned_mempool_type;
@@ -719,32 +793,32 @@ struct ReduceAtomicHostInit_Data
   bool is_setup;
   bool owns_device_pointer;
 
-  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){};
+  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {};
 
   ReduceAtomicHostInit_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        is_setup{false},
-        owns_device_pointer{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        is_setup {false},
+        owns_device_pointer {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        is_setup{other.is_setup},
-        owns_device_pointer{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        is_setup {other.is_setup},
+        owns_device_pointer {false}
+  {}
 
-  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default;
+  ReduceAtomicHostInit_Data&
+  operator=(const ReduceAtomicHostInit_Data&) = default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -756,9 +830,8 @@ struct ReduceAtomicHostInit_Data
   {
     T temp = value;
 
-    impl::grid_reduce_atomic_host_init<Combiner,
-        replication, atomic_stride>(
-            temp, identity, output);
+    impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
+        temp, identity, output);
   }
 
   //! check and setup for device
@@ -766,8 +839,9 @@ struct ReduceAtomicHostInit_Data
   bool setupForDevice()
   {
     bool act = !is_setup && setupReducers();
-    if (act) {
-      is_setup = true;
+    if (act)
+    {
+      is_setup            = true;
       owns_device_pointer = true;
     }
     return act;
@@ -778,8 +852,9 @@ struct ReduceAtomicHostInit_Data
   bool teardownForDevice()
   {
     bool act = owns_device_pointer;
-    if (act) {
-      is_setup = false;
+    if (act)
+    {
+      is_setup            = false;
       owns_device_pointer = false;
     }
     return act;
@@ -787,12 +862,15 @@ struct ReduceAtomicHostInit_Data
 };
 
 //! Reduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
+template <typename Combiner,
+          typename Accessor,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
 struct ReduceAtomicDeviceInit_Data
 {
   using tally_mempool_type = pinned_mempool_type;
-  using data_mempool_type = device_mempool_type;
+  using data_mempool_type  = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
 
   static constexpr size_t tally_slots = replication;
@@ -803,34 +881,34 @@ struct ReduceAtomicDeviceInit_Data
   T* device;
   bool owns_device_pointer;
 
-  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){};
+  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {};
 
   ReduceAtomicDeviceInit_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        device_count{nullptr},
-        device{nullptr},
-        owns_device_pointer{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        device_count {nullptr},
+        device {nullptr},
+        owns_device_pointer {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        device_count{other.device_count},
-        device{other.device},
-        owns_device_pointer{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        device_count {other.device_count},
+        device {other.device},
+        owns_device_pointer {false}
+  {}
 
-  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default;
+  ReduceAtomicDeviceInit_Data&
+  operator=(const ReduceAtomicDeviceInit_Data&) = default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -842,10 +920,12 @@ struct ReduceAtomicDeviceInit_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_atomic_device_init<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
-    if (replicationId != replication) {
+    size_t replicationId =
+        impl::grid_reduce_atomic_device_init<Combiner, Accessor, replication,
+                                             atomic_stride>(
+            temp, identity, device, device_count);
+    if (replicationId != replication)
+    {
       output[replicationId] = temp;
     }
   }
@@ -855,10 +935,13 @@ struct ReduceAtomicDeviceInit_Data
   bool setupForDevice()
   {
     bool act = !device && setupReducers();
-    if (act) {
-      device = data_mempool_type::getInstance().template malloc<T>(replication*atomic_stride);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+    if (act)
+    {
+      device = data_mempool_type::getInstance().template malloc<T>(
+          replication * atomic_stride);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       owns_device_pointer = true;
     }
     return act;
@@ -869,11 +952,12 @@ struct ReduceAtomicDeviceInit_Data
   bool teardownForDevice()
   {
     bool act = owns_device_pointer;
-    if (act) {
+    if (act)
+    {
       data_mempool_type::getInstance().free(device);
       device = nullptr;
       count_mempool_type::getInstance().free(device_count);
-      device_count = nullptr;
+      device_count        = nullptr;
       owns_device_pointer = false;
     }
     return act;
@@ -885,49 +969,77 @@ struct ReduceAtomicDeviceInit_Data
 template <typename Combiner, typename T, typename tuning>
 class Reduce
 {
-  static constexpr size_t replication = (tuning::replication > 0)
-      ? tuning::replication
-      : 1;
-  static constexpr size_t atomic_stride = (tuning::atomic_stride > 0)
-      ? tuning::atomic_stride
-      : ((policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
-        ? RAJA_DIVIDE_CEILING_INT(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T))
-        : 1);
-
-  using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence),
+  static constexpr size_t replication =
+      (tuning::replication > 0) ? tuning::replication : 1;
+  static constexpr size_t atomic_stride =
+      (tuning::atomic_stride > 0)
+          ? tuning::atomic_stride
+          : ((policy::cuda::device_constants
+                  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
+                 ? RAJA_DIVIDE_CEILING_INT(
+                       policy::cuda::device_constants
+                           .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
+                       sizeof(T))
+                 : 1);
+
+  using Accessor = std::conditional_t<
+      (tuning::comm_mode == block_communication_mode::block_fence),
       impl::AccessorDeviceScopeUseBlockFence,
-      std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence),
-        impl::AccessorDeviceScopeUseDeviceFence,
-        void>>;
+      std::conditional_t<(tuning::comm_mode ==
+                          block_communication_mode::device_fence),
+                         impl::AccessorDeviceScopeUseDeviceFence,
+                         void>>;
 
   static constexpr bool atomic_policy =
-      (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) ||
+      (tuning::algorithm ==
+       reduce_algorithm::init_device_combine_atomic_block) ||
       (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
-  static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::cuda::cuda_atomic_available<T>::value;
 
   //! cuda reduction data storage class and folding algorithm
-  using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) ||
-                                              (atomic_policy && !atomic_available),
-      cuda::ReduceLastBlock_Data<Combiner, Accessor, T, replication, atomic_stride>,
-      std::conditional_t<atomic_available,
-        std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block),
-          cuda::ReduceAtomicDeviceInit_Data<Combiner, Accessor, T, replication, atomic_stride>,
-          std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block),
-            cuda::ReduceAtomicHostInit_Data<Combiner, T, replication, atomic_stride>,
-            void>>,
-        void>>;
+  using reduce_data_type = std::conditional_t<
+      (tuning::algorithm == reduce_algorithm::combine_last_block) ||
+          (atomic_policy && !atomic_available),
+      cuda::ReduceLastBlock_Data<Combiner,
+                                 Accessor,
+                                 T,
+                                 replication,
+                                 atomic_stride>,
+      std::conditional_t<
+          atomic_available,
+          std::conditional_t<
+              (tuning::algorithm ==
+               reduce_algorithm::init_device_combine_atomic_block),
+              cuda::ReduceAtomicDeviceInit_Data<Combiner,
+                                                Accessor,
+                                                T,
+                                                replication,
+                                                atomic_stride>,
+              std::conditional_t<
+                  (tuning::algorithm ==
+                   reduce_algorithm::init_host_combine_atomic_block),
+                  cuda::ReduceAtomicHostInit_Data<Combiner,
+                                                  T,
+                                                  replication,
+                                                  atomic_stride>,
+                  void>>,
+          void>>;
 
   static constexpr size_t tally_slots = reduce_data_type::tally_slots;
 
-  using TallyType = PinnedTally<T, tally_slots, typename reduce_data_type::tally_mempool_type>;
+  using TallyType = PinnedTally<T,
+                                tally_slots,
+                                typename reduce_data_type::tally_mempool_type>;
 
   //! union to hold either pointer to PinnedTally or pointer to value
   //  only use list before setup for device and only use val_ptr after
-  union tally_u {
+  union tally_u
+  {
     TallyType* list;
     T* val_ptr;
-    constexpr tally_u(TallyType* l) : list(l){};
-    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){};
+    constexpr tally_u(TallyType* l) : list(l) {};
+    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr) {};
   };
 
 public:
@@ -936,11 +1048,10 @@ class Reduce
   //! create a reduce object
   //  the original object's parent is itself
   explicit Reduce(T init_val, T identity_ = Combiner::identity())
-      : parent{this},
-        tally_or_val_ptr{new TallyType},
+      : parent {this},
+        tally_or_val_ptr {new TallyType},
         val(init_val, identity_)
-  {
-  }
+  {}
 
   void reset(T in_val, T identity_ = Combiner::identity())
   {
@@ -954,16 +1065,18 @@ class Reduce
   RAJA_HOST_DEVICE
   Reduce(const Reduce& other)
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-      : parent{other.parent},
+      : parent {other.parent},
 #else
-      : parent{&other},
+      : parent {&other},
 #endif
-        tally_or_val_ptr{other.tally_or_val_ptr},
+        tally_or_val_ptr {other.tally_or_val_ptr},
         val(other.val)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (parent) {
-      if (val.setupForDevice()) {
+    if (parent)
+    {
+      if (val.setupForDevice())
+      {
         tally_or_val_ptr.val_ptr = val.init_grid_vals(
             tally_or_val_ptr.list->new_value(currentResource()));
         parent = nullptr;
@@ -978,25 +1091,35 @@ class Reduce
   ~Reduce()
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (parent == this) {
+    if (parent == this)
+    {
       delete tally_or_val_ptr.list;
       tally_or_val_ptr.list = nullptr;
-    } else if (parent) {
-      if (val.value != val.identity) {
+    }
+    else if (parent)
+    {
+      if (val.value != val.identity)
+      {
 #if defined(RAJA_ENABLE_OPENMP)
         lock_guard<omp::mutex> lock(tally_or_val_ptr.list->m_mutex);
 #endif
         parent->combine(val.value);
       }
-    } else {
-      if (val.teardownForDevice()) {
+    }
+    else
+    {
+      if (val.teardownForDevice())
+      {
         tally_or_val_ptr.val_ptr = nullptr;
       }
     }
 #else
-    if (!parent->parent) {
+    if (!parent->parent)
+    {
       val.grid_reduce(tally_or_val_ptr.val_ptr);
-    } else {
+    }
+    else
+    {
       parent->combine(val.value);
     }
 #endif
@@ -1005,15 +1128,18 @@ class Reduce
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    auto n = tally_or_val_ptr.list->begin();
+    auto n   = tally_or_val_ptr.list->begin();
     auto end = tally_or_val_ptr.list->end();
-    if (n != end) {
+    if (n != end)
+    {
       tally_or_val_ptr.list->synchronize_resources();
       ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
           reducer(std::move(val.value));
-      for (; n != end; ++n) {
+      for (; n != end; ++n)
+      {
         T(&values)[tally_slots] = *n;
-        for (size_t r = 0; r < tally_slots; ++r) {
+        for (size_t r = 0; r < tally_slots; ++r)
+        {
           reducer.combine(std::move(values[r]));
         }
       }
@@ -1027,7 +1153,7 @@ class Reduce
 
   //! apply reduction (const version) -- still combines internal values
   RAJA_HOST_DEVICE
-  void combine(T other) const { Combiner{}(val.value, other); }
+  void combine(T other) const { Combiner {}(val.value, other); }
 
   /*!
    *  \return reference to the local value
@@ -1137,33 +1263,39 @@ class ReduceMax<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 //! specialization of ReduceMinLoc for cuda_reduce
 template <typename tuning, typename T, typename IndexType>
 class ReduceMinLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
-    : public cuda::Reduce<RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
-                          RAJA::reduce::detail::ValueLoc<T, IndexType>,
-                          tuning>
+    : public cuda::Reduce<
+          RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType>,
+          tuning>
 {
 
 public:
-  using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType>;
-  using Combiner = RAJA::reduce::min<value_type>;
+  using value_type     = RAJA::reduce::detail::ValueLoc<T, IndexType>;
+  using Combiner       = RAJA::reduce::min<value_type>;
   using NonLocCombiner = RAJA::reduce::min<T>;
-  using Base = cuda::Reduce<Combiner, value_type, tuning>;
+  using Base           = cuda::Reduce<Combiner, value_type, tuning>;
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMinLoc(T init_val, IndexType init_idx,
+  ReduceMinLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
-  {
-  }
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
+  {}
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
@@ -1187,33 +1319,39 @@ class ReduceMinLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
 //! specialization of ReduceMaxLoc for cuda_reduce
 template <typename tuning, typename T, typename IndexType>
 class ReduceMaxLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
-    : public cuda::
-          Reduce<RAJA::reduce::max<RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
-                 RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
-                 tuning>
+    : public cuda::Reduce<
+          RAJA::reduce::max<
+              RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
+          tuning>
 {
 public:
-  using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
-  using Combiner = RAJA::reduce::max<value_type>;
+  using value_type     = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
+  using Combiner       = RAJA::reduce::max<value_type>;
   using NonLocCombiner = RAJA::reduce::max<T>;
-  using Base = cuda::Reduce<Combiner, value_type, tuning>;
+  using Base           = cuda::Reduce<Combiner, value_type, tuning>;
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMaxLoc(T init_val, IndexType init_idx,
+  ReduceMaxLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
-  {
-  }
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
+  {}
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp
index 0a9b0bf305..2b60028cb0 100644
--- a/include/RAJA/policy/cuda/scan.hpp
+++ b/include/RAJA/policy/cuda/scan.hpp
@@ -49,40 +49,34 @@ template <typename IterationMapping,
           bool Async,
           typename InputIter,
           typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-inclusive_inplace(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    InputIter begin,
-    InputIter end,
-    Function binary_op)
+RAJA_INLINE resources::EventProxy<resources::Cuda>
+inclusive_inplace(resources::Cuda cuda_res,
+                  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                           IterationGetter,
+                                                           Concretizer,
+                                                           BLOCKS_PER_SM,
+                                                           Async>,
+                  InputIter begin,
+                  InputIter end,
+                  Function binary_op)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              binary_op,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, begin,
+                                              binary_op, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              binary_op,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, begin,
+                                              binary_op, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
@@ -103,43 +97,35 @@ template <typename IterationMapping,
           typename InputIter,
           typename Function,
           typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-exclusive_inplace(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    InputIter begin,
-    InputIter end,
-    Function binary_op,
-    T init)
+RAJA_INLINE resources::EventProxy<resources::Cuda>
+exclusive_inplace(resources::Cuda cuda_res,
+                  ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                           IterationGetter,
+                                                           Concretizer,
+                                                           BLOCKS_PER_SM,
+                                                           Async>,
+                  InputIter begin,
+                  InputIter end,
+                  Function binary_op,
+                  T init)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              binary_op,
-                                              init,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, begin,
+                                              binary_op, init, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              binary_op,
-                                              init,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, begin,
+                                              binary_op, init, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
@@ -160,41 +146,33 @@ template <typename IterationMapping,
           typename InputIter,
           typename OutputIter,
           typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-inclusive(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    InputIter begin,
-    InputIter end,
-    OutputIter out,
-    Function binary_op)
+RAJA_INLINE resources::EventProxy<resources::Cuda>
+inclusive(resources::Cuda cuda_res,
+          ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                   IterationGetter,
+                                                   Concretizer,
+                                                   BLOCKS_PER_SM,
+                                                   Async>,
+          InputIter begin,
+          InputIter end,
+          OutputIter out,
+          Function binary_op)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              len,
-                                              stream));
+  cudaErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
-  cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              len,
-                                              stream));
+  cudaErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
@@ -216,44 +194,36 @@ template <typename IterationMapping,
           typename OutputIter,
           typename Function,
           typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-exclusive(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    InputIter begin,
-    InputIter end,
-    OutputIter out,
-    Function binary_op,
-    T init)
+RAJA_INLINE resources::EventProxy<resources::Cuda>
+exclusive(resources::Cuda cuda_res,
+          ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                   IterationGetter,
+                                                   Concretizer,
+                                                   BLOCKS_PER_SM,
+                                                   Async>,
+          InputIter begin,
+          InputIter end,
+          OutputIter out,
+          Function binary_op,
+          T init)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              init,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, out,
+                                              binary_op, init, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              init,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, out,
+                                              binary_op, init, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp
index c5a353b704..61b6f92673 100644
--- a/include/RAJA/policy/cuda/sort.hpp
+++ b/include/RAJA/policy/cuda/sort.hpp
@@ -44,32 +44,44 @@ namespace sort
 /*!
         \brief static assert unimplemented stable sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter,
-    Iter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename Iter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter,
+       Iter,
+       Compare)
 {
-  static_assert (std::is_pointer<Iter>::value,
-      "stable_sort<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<Iter>::value,
+                "stable_sort<cuda_exec> is only implemented for pointers");
   using iterval = RAJA::detail::IterVal<Iter>;
-  static_assert (type_traits::is_arithmetic<iterval>::value,
+  static_assert(
+      type_traits::is_arithmetic<iterval>::value,
       "stable_sort<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<iterval>>,
-      camp::is_same<Compare, operators::greater<iterval>>>::value,
-      "stable_sort<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(concepts::any_of<
+                    camp::is_same<Compare, operators::less<iterval>>,
+                    camp::is_same<Compare, operators::greater<iterval>>>::value,
+                "stable_sort<cuda_exec> is only implemented for "
+                "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -77,26 +89,32 @@ stable(
 /*!
         \brief stable sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>>)
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter begin,
+       Iter end,
+       operators::less<RAJA::detail::IterVal<Iter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
-  int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int len       = std::distance(begin, end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
@@ -106,15 +124,11 @@ stable(
   cub::DoubleBuffer<R> d_keys(begin, d_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
-                                              temp_storage_bytes,
-                                              d_keys,
-                                              len,
-                                              begin_bit,
-                                              end_bit,
-                                              stream));
+                                              temp_storage_bytes, d_keys, len,
+                                              begin_bit, end_bit, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
@@ -122,19 +136,17 @@ stable(
 
   // Run
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
-                                              temp_storage_bytes,
-                                              d_keys,
-                                              len,
-                                              begin_bit,
-                                              end_bit,
-                                              stream));
+                                              temp_storage_bytes, d_keys, len,
+                                              begin_bit, end_bit, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (d_keys.Current() == d_out) {
+  if (d_keys.Current() == d_out)
+  {
 
     // copy
-    cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(begin, d_out, len * sizeof(R), cudaMemcpyDefault,
+                               stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_out);
@@ -147,26 +159,32 @@ stable(
 /*!
         \brief stable sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>>)
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter begin,
+       Iter end,
+       operators::greater<RAJA::detail::IterVal<Iter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
-  int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int len       = std::distance(begin, end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
@@ -176,35 +194,29 @@ stable(
   cub::DoubleBuffer<R> d_keys(begin, d_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
-                                                        temp_storage_bytes,
-                                                        d_keys,
-                                                        len,
-                                                        begin_bit,
-                                                        end_bit,
-                                                        stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, len, begin_bit, end_bit,
+      stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
 
   // Run
-  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
-                                                        temp_storage_bytes,
-                                                        d_keys,
-                                                        len,
-                                                        begin_bit,
-                                                        end_bit,
-                                                        stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, len, begin_bit, end_bit,
+      stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (d_keys.Current() == d_out) {
+  if (d_keys.Current() == d_out)
+  {
 
     // copy
-    cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(begin, d_out, len * sizeof(R), cudaMemcpyDefault,
+                               stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_out);
@@ -218,32 +230,43 @@ stable(
 /*!
         \brief static assert unimplemented sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter,
-    Iter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename Iter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async>,
+         Iter,
+         Iter,
+         Compare)
 {
-  static_assert (std::is_pointer<Iter>::value,
-      "sort<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<Iter>::value,
+                "sort<cuda_exec> is only implemented for pointers");
   using iterval = RAJA::detail::IterVal<Iter>;
-  static_assert (type_traits::is_arithmetic<iterval>::value,
-      "sort<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<iterval>>,
-      camp::is_same<Compare, operators::greater<iterval>>>::value,
-      "sort<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(type_traits::is_arithmetic<iterval>::value,
+                "sort<cuda_exec> is only implemented for arithmetic types");
+  static_assert(concepts::any_of<
+                    camp::is_same<Compare, operators::less<iterval>>,
+                    camp::is_same<Compare, operators::greater<iterval>>>::value,
+                "sort<cuda_exec> is only implemented for RAJA::operators::less "
+                "or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -251,18 +274,24 @@ unstable(
 /*!
         \brief sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async> p,
+         Iter begin,
+         Iter end,
+         operators::less<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(cuda_res, p, begin, end, comp);
 }
@@ -270,18 +299,24 @@ unstable(
 /*!
         \brief sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async> p,
+         Iter begin,
+         Iter end,
+         operators::greater<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(cuda_res, p, begin, end, comp);
 }
@@ -290,36 +325,52 @@ unstable(
 /*!
         \brief static assert unimplemented stable sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter,
-    KeyIter,
-    ValIter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter,
+             KeyIter,
+             ValIter,
+             Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
+  static_assert(
+      std::is_pointer<KeyIter>::value,
       "stable_sort_pairs<cuda_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
+  static_assert(
+      std::is_pointer<ValIter>::value,
       "stable_sort_pairs<cuda_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
+  static_assert(
+      type_traits::is_arithmetic<K>::value,
       "stable_sort_pairs<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "stable_sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "stable_sort_pairs<cuda_exec> is only implemented for "
+      "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -327,29 +378,37 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::less<RAJA::detail::IterVal<KeyIter>>)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             operators::less<RAJA::detail::IterVal<KeyIter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using K = RAJA::detail::IterVal<KeyIter>;
   using V = RAJA::detail::IterVal<ValIter>;
 
-  int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int len       = std::distance(keys_begin, keys_end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
@@ -361,42 +420,36 @@ stable_pairs(
   cub::DoubleBuffer<V> d_vals(vals_begin, d_vals_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
-                                               temp_storage_bytes,
-                                               d_keys,
-                                               d_vals,
-                                               len,
-                                               begin_bit,
-                                               end_bit,
-                                               stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairs(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
 
   // Run
-  cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
-                                               temp_storage_bytes,
-                                               d_keys,
-                                               d_vals,
-                                               len,
-                                               begin_bit,
-                                               end_bit,
-                                               stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairs(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (d_keys.Current() == d_keys_out) {
+  if (d_keys.Current() == d_keys_out)
+  {
 
     // copy keys
-    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len * sizeof(K),
+                               cudaMemcpyDefault, stream));
   }
-  if (d_vals.Current() == d_vals_out) {
+  if (d_vals.Current() == d_vals_out)
+  {
 
     // copy vals
-    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len * sizeof(V),
+                               cudaMemcpyDefault, stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_keys_out);
@@ -410,29 +463,37 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::greater<RAJA::detail::IterVal<KeyIter>>)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             operators::greater<RAJA::detail::IterVal<KeyIter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using K = RAJA::detail::IterVal<KeyIter>;
   using V = RAJA::detail::IterVal<ValIter>;
 
-  int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int len       = std::distance(keys_begin, keys_end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
@@ -444,42 +505,36 @@ stable_pairs(
   cub::DoubleBuffer<V> d_vals(vals_begin, d_vals_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
-                                                         temp_storage_bytes,
-                                                         d_keys,
-                                                         d_vals,
-                                                         len,
-                                                         begin_bit,
-                                                         end_bit,
-                                                         stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
 
   // Run
-  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
-                                                         temp_storage_bytes,
-                                                         d_keys,
-                                                         d_vals,
-                                                         len,
-                                                         begin_bit,
-                                                         end_bit,
-                                                         stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (d_keys.Current() == d_keys_out) {
+  if (d_keys.Current() == d_keys_out)
+  {
 
     // copy keys
-    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len * sizeof(K),
+                               cudaMemcpyDefault, stream));
   }
-  if (d_vals.Current() == d_vals_out) {
+  if (d_vals.Current() == d_vals_out)
+  {
 
     // copy vals
-    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len * sizeof(V),
+                               cudaMemcpyDefault, stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_keys_out);
@@ -494,36 +549,50 @@ stable_pairs(
 /*!
         \brief static assert unimplemented sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter,
-    KeyIter,
-    ValIter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async>,
+               KeyIter,
+               KeyIter,
+               ValIter,
+               Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "sort_pairs<cuda_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "sort_pairs<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "sort_pairs<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "sort_pairs<cuda_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
+  static_assert(
+      type_traits::is_arithmetic<K>::value,
       "sort_pairs<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or "
+      "RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -531,20 +600,28 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::less<RAJA::detail::IterVal<KeyIter>> comp)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async> p,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               operators::less<RAJA::detail::IterVal<KeyIter>> comp)
 {
   return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
 }
@@ -552,20 +629,28 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::greater<RAJA::detail::IterVal<KeyIter>> comp)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async> p,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               operators::greater<RAJA::detail::IterVal<KeyIter>> comp)
 {
   return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
 }
diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp
index 71bf429079..8844937700 100644
--- a/include/RAJA/policy/desul/atomic.hpp
+++ b/include/RAJA/policy/desul/atomic.hpp
@@ -28,176 +28,131 @@ namespace RAJA
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicLoad(AtomicPolicy, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(AtomicPolicy, T* acc)
 {
-  return desul::atomic_load(acc,
-                            raja_default_desul_order{},
-                            raja_default_desul_scope{});
+  return desul::atomic_load(acc, raja_default_desul_order {},
+                            raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void
-atomicStore(AtomicPolicy, T *acc, T value)
-{
-  desul::atomic_store(acc,
-                      value,
-                      raja_default_desul_order{},
-                      raja_default_desul_scope{});
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(AtomicPolicy, T* acc, T value)
+{
+  desul::atomic_store(acc, value, raja_default_desul_order {},
+                      raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicAdd(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_add(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_add(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicSub(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_sub(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_sub(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(AtomicPolicy, T* acc, T value)
 {
-  return desul::atomic_fetch_min(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+  return desul::atomic_fetch_min(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(AtomicPolicy, T* acc, T value)
 {
-  return desul::atomic_fetch_max(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+  return desul::atomic_fetch_max(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(AtomicPolicy, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T* acc)
 {
-  return desul::atomic_fetch_inc(acc,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+  return desul::atomic_fetch_inc(acc, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(AtomicPolicy, T *acc, T val)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T* acc, T val)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
-  return desul::atomic_fetch_inc_mod(acc,
-                                     val,
-                                     raja_default_desul_order{},
-                                     raja_default_desul_scope{});
+  return desul::atomic_fetch_inc_mod(acc, val, raja_default_desul_order {},
+                                     raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(AtomicPolicy, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T* acc)
 {
-  return desul::atomic_fetch_dec(acc,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+  return desul::atomic_fetch_dec(acc, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(AtomicPolicy, T *acc, T val)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T* acc, T val)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
-  return desul::atomic_fetch_dec_mod(acc,
-                                     val,
-                                     raja_default_desul_order{},
-                                     raja_default_desul_scope{});
+  return desul::atomic_fetch_dec_mod(acc, val, raja_default_desul_order {},
+                                     raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(AtomicPolicy, T* acc, T value)
 {
-  return desul::atomic_fetch_and(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+  return desul::atomic_fetch_and(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(AtomicPolicy, T* acc, T value)
 {
-  return desul::atomic_fetch_or(acc,
-                                value,
-                                raja_default_desul_order{},
-                                raja_default_desul_scope{});
+  return desul::atomic_fetch_or(acc, value, raja_default_desul_order {},
+                                raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(AtomicPolicy, T* acc, T value)
 {
-  return desul::atomic_fetch_xor(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+  return desul::atomic_fetch_xor(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(AtomicPolicy, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(AtomicPolicy, T* acc, T value)
 {
-  return desul::atomic_exchange(acc,
-                                value,
-                                raja_default_desul_order{},
-                                raja_default_desul_scope{});
+  return desul::atomic_exchange(acc, value, raja_default_desul_order {},
+                                raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(AtomicPolicy, T *acc, T compare, T value)
-{
-  return desul::atomic_compare_exchange(acc,
-                                        compare,
-                                        value,
-                                        raja_default_desul_order{},
-                                        raja_default_desul_scope{});
+RAJA_HOST_DEVICE RAJA_INLINE T
+atomicCAS(AtomicPolicy, T* acc, T compare, T value)
+{
+  return desul::atomic_compare_exchange(acc, compare, value,
+                                        raja_default_desul_order {},
+                                        raja_default_desul_scope {});
 }
 
 }  // namespace RAJA
 
 #endif  // RAJA_ENABLE_DESUL_ATOMICS
-#endif // guard
+#endif  // guard
diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp
index f1f69eab5e..6c829798be 100644
--- a/include/RAJA/policy/hip/MemUtils_HIP.hpp
+++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp
@@ -72,14 +72,15 @@ hipDeviceProp_t& device_prop()
 
 
 //! Allocator for pinned memory for use in basic_mempool
-struct PinnedAllocator {
+struct PinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
     void* ptr;
     hipErrchk(hipHostMalloc(&ptr, nbytes,
-        hipHostMallocMapped | hipHostMallocNonCoherent));
+                            hipHostMallocMapped | hipHostMallocNonCoherent));
     return ptr;
   }
 
@@ -92,7 +93,8 @@ struct PinnedAllocator {
 };
 
 //! Allocator for device memory for use in basic_mempool
-struct DeviceAllocator {
+struct DeviceAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -112,7 +114,8 @@ struct DeviceAllocator {
 
 //! Allocator for pre-zeroed device memory for use in basic_mempool
 //  Note: Memory must be zero when returned to mempool
-struct DeviceZeroedAllocator {
+struct DeviceZeroedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -134,7 +137,8 @@ struct DeviceZeroedAllocator {
 };
 
 //! Allocator for device pinned memory for use in basic_mempool
-struct DevicePinnedAllocator {
+struct DevicePinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -155,22 +159,25 @@ struct DevicePinnedAllocator {
 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
 using device_zeroed_mempool_type =
     basic_mempool::MemPool<DeviceZeroedAllocator>;
-using device_pinned_mempool_type = basic_mempool::MemPool<DevicePinnedAllocator>;
+using device_pinned_mempool_type =
+    basic_mempool::MemPool<DevicePinnedAllocator>;
 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 
 namespace detail
 {
 
 //! struct containing data necessary to coordinate kernel launches with reducers
-struct hipInfo {
+struct hipInfo
+{
   const void* func = nullptr;
-  hip_dim_t gridDim{0, 0, 0};
-  hip_dim_t blockDim{0, 0, 0};
+  hip_dim_t gridDim {0, 0, 0};
+  hip_dim_t blockDim {0, 0, 0};
   size_t* dynamic_smem = nullptr;
-  ::RAJA::resources::Hip res{::RAJA::resources::Hip::HipFromStream(0,0)};
+  ::RAJA::resources::Hip res {::RAJA::resources::Hip::HipFromStream(0, 0)};
   bool setup_reducers = false;
 };
-struct hipStatusInfo : hipInfo {
+struct hipStatusInfo : hipInfo
+{
 #if defined(RAJA_ENABLE_OPENMP)
   omp::mutex lock;
 #endif
@@ -187,10 +194,7 @@ extern hipStatusInfo tl_status;
 extern std::unordered_map<hipStream_t, bool> g_stream_info_map;
 
 RAJA_INLINE
-void synchronize_impl(::RAJA::resources::Hip res)
-{
-  res.wait();
-}
+void synchronize_impl(::RAJA::resources::Hip res) { res.wait(); }
 
 }  // namespace detail
 
@@ -202,13 +206,16 @@ void synchronize()
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   bool synchronize = false;
-  for (auto& val : detail::g_stream_info_map) {
-    if (!val.second) {
+  for (auto& val : detail::g_stream_info_map)
+  {
+    if (!val.second)
+    {
       synchronize = true;
-      val.second = true;
+      val.second  = true;
     }
   }
-  if (synchronize) {
+  if (synchronize)
+  {
     hipErrchk(hipDeviceSynchronize());
   }
 }
@@ -221,12 +228,16 @@ void synchronize(::RAJA::resources::Hip res)
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   auto iter = detail::g_stream_info_map.find(res.get_stream());
-  if (iter != detail::g_stream_info_map.end()) {
-    if (!iter->second) {
+  if (iter != detail::g_stream_info_map.end())
+  {
+    if (!iter->second)
+    {
       iter->second = true;
       detail::synchronize_impl(res);
     }
-  } else {
+  }
+  else
+  {
     RAJA_ABORT_OR_THROW("Cannot synchronize unknown resource.");
   }
 }
@@ -239,30 +250,41 @@ void launch(::RAJA::resources::Hip res, bool async = true)
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   auto iter = detail::g_stream_info_map.find(res.get_stream());
-  if (iter != detail::g_stream_info_map.end()) {
+  if (iter != detail::g_stream_info_map.end())
+  {
     iter->second = !async;
-  } else {
+  }
+  else
+  {
     detail::g_stream_info_map.emplace(res.get_stream(), !async);
   }
-  if (!async) {
+  if (!async)
+  {
     detail::synchronize_impl(res);
   }
 }
 
 //! Launch kernel and indicate resource synchronization status
 RAJA_INLINE
-void launch(const void* func, hip_dim_t gridDim, hip_dim_t blockDim, void** args, size_t shmem,
-            ::RAJA::resources::Hip res, bool async = true, const char *name = nullptr)
+void launch(const void* func,
+            hip_dim_t gridDim,
+            hip_dim_t blockDim,
+            void** args,
+            size_t shmem,
+            ::RAJA::resources::Hip res,
+            bool async       = true,
+            const char* name = nullptr)
 {
-  #if defined(RAJA_ENABLE_ROCTX)
-  if(name) roctxRangePush(name);
-  #else
-    RAJA_UNUSED_VAR(name);
-  #endif
-  hipErrchk(hipLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
-  #if defined(RAJA_ENABLE_ROCTX)
-  if(name) roctxRangePop();
-  #endif
+#if defined(RAJA_ENABLE_ROCTX)
+  if (name) roctxRangePush(name);
+#else
+  RAJA_UNUSED_VAR(name);
+#endif
+  hipErrchk(
+      hipLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
+#if defined(RAJA_ENABLE_ROCTX)
+  if (name) roctxRangePop();
+#endif
   launch(res, async);
 }
 
@@ -280,9 +302,11 @@ hip_dim_t currentGridDim() { return detail::tl_status.gridDim; }
 
 //! get grid size of current launch
 RAJA_INLINE
-hip_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x *
-                                            detail::tl_status.gridDim.y *
-                                            detail::tl_status.gridDim.z; }
+hip_dim_member_t currentGridSize()
+{
+  return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
+         detail::tl_status.gridDim.z;
+}
 
 //! get blockDim of current launch
 RAJA_INLINE
@@ -290,9 +314,11 @@ hip_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
 
 //! get block size of current launch
 RAJA_INLINE
-hip_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x *
-                                             detail::tl_status.blockDim.y *
-                                             detail::tl_status.blockDim.z; }
+hip_dim_member_t currentBlockSize()
+{
+  return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
+         detail::tl_status.blockDim.z;
+}
 
 //! get dynamic shared memory usage for current launch
 RAJA_INLINE
@@ -307,7 +333,8 @@ size_t maxDynamicShmem()
   return func_attr.maxDynamicSharedSizeBytes;
 }
 
-constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::max();
+constexpr size_t dynamic_smem_allocation_failure =
+    std::numeric_limits<size_t>::max();
 
 //! Allocate dynamic shared memory for current launch
 //
@@ -319,24 +346,27 @@ constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::
 //  Returns an offset into dynamic shared memory aligned to align on success,
 //  or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory
 //  takes the failure return path.
-template < typename T, typename GetNFromMax >
-RAJA_INLINE
-size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T))
+template <typename T, typename GetNFromMax>
+RAJA_INLINE size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
+                                        size_t align = alignof(T))
 {
   const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
-  const size_t align_offset = ((unaligned_shmem % align) != size_t(0))
-      ? align - (unaligned_shmem % align)
-      : size_t(0);
-  const size_t aligned_shmem = unaligned_shmem + align_offset;
+  const size_t align_offset    = ((unaligned_shmem % align) != size_t(0))
+                                     ? align - (unaligned_shmem % align)
+                                     : size_t(0);
+  const size_t aligned_shmem   = unaligned_shmem + align_offset;
 
   const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
-  const size_t n_bytes = sizeof(T) *
-      std::forward<GetNFromMax>(get_n_from_max)(max_shmem_bytes / sizeof(T));
+  const size_t n_bytes = sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
+                                         max_shmem_bytes / sizeof(T));
 
-  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) {
+  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes)
+  {
     *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
     return aligned_shmem;
-  } else {
+  }
+  else
+  {
     return dynamic_smem_allocation_failure;
   }
 }
@@ -351,16 +381,17 @@ ::RAJA::resources::Hip currentResource() { return detail::tl_status.res; }
 // their copy constructors. Both look at tl_status to setup per kernel launch
 // resources.
 template <typename LOOP_BODY>
-RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
-    const void* func,
-    hip_dim_t gridDim,
-    hip_dim_t blockDim,
-    size_t& dynamic_smem,
-    ::RAJA::resources::Hip res,
-    LOOP_BODY&& loop_body)
+RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type
+make_launch_body(const void* func,
+                 hip_dim_t gridDim,
+                 hip_dim_t blockDim,
+                 size_t& dynamic_smem,
+                 ::RAJA::resources::Hip res,
+                 LOOP_BODY&& loop_body)
 {
-  ::RAJA::detail::ScopedAssignment<detail::hipInfo> info_sa(detail::tl_status,
-      detail::hipInfo{func, gridDim, blockDim, &dynamic_smem, res, true});
+  ::RAJA::detail::ScopedAssignment<detail::hipInfo> info_sa(
+      detail::tl_status,
+      detail::hipInfo {func, gridDim, blockDim, &dynamic_smem, res, true});
 
   using return_type = typename std::remove_reference<LOOP_BODY>::type;
   return return_type(std::forward<LOOP_BODY>(loop_body));
@@ -375,7 +406,8 @@ static constexpr size_t hip_occupancy_uninitialized_size_t =
 struct HipFixedMaxBlocksData
 {
   int device_sm_per_device = hip::device_prop().multiProcessorCount;
-  int device_max_threads_per_sm = hip::device_prop().maxThreadsPerMultiProcessor;
+  int device_max_threads_per_sm =
+      hip::device_prop().maxThreadsPerMultiProcessor;
 };
 
 //! Get the maximum theoretical occupancy of the device
@@ -391,32 +423,33 @@ HipFixedMaxBlocksData hip_max_blocks()
 struct HipOccMaxBlocksThreadsData
 {
   size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
-  int func_max_blocks_per_device = hip_occupancy_uninitialized_int;
-  int func_max_threads_per_block = hip_occupancy_uninitialized_int;
+  int func_max_blocks_per_device      = hip_occupancy_uninitialized_int;
+  int func_max_threads_per_block      = hip_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with unknown threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE HipOccMaxBlocksThreadsData
+hip_occupancy_max_blocks_threads(const void* func,
+                                 size_t func_dynamic_shmem_per_block)
 {
   static thread_local HipOccMaxBlocksThreadsData data;
 
-  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
     hipErrchk(hipOccupancyMaxPotentialBlockSize(
-        &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block));
+        &data.func_max_blocks_per_device, &data.func_max_threads_per_block,
+        func, func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
-    hipDeviceProp_t& prop = hip::device_prop();
+    hipDeviceProp_t& prop           = hip::device_prop();
     data.func_max_blocks_per_device = prop.multiProcessorCount;
     data.func_max_threads_per_block = 1024;
 #endif
-
   }
 
   return data;
@@ -426,60 +459,70 @@ HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func,
 struct HipOccMaxBlocksData : HipFixedMaxBlocksData
 {
   size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
-  int func_threads_per_block = hip_occupancy_uninitialized_int;
-  int func_max_blocks_per_sm = hip_occupancy_uninitialized_int;
+  int func_threads_per_block          = hip_occupancy_uninitialized_int;
+  int func_max_blocks_per_sm          = hip_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with compile time threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block >
-RAJA_INLINE
-HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block>
+RAJA_INLINE HipOccMaxBlocksData
+hip_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block)
 {
   static thread_local HipOccMaxBlocksData data;
 
-  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
-    data.func_threads_per_block = func_threads_per_block;
+    data.func_threads_per_block       = func_threads_per_block;
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
     hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
+        &data.func_max_blocks_per_sm, func, func_threads_per_block,
+        func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
-    data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024;
-    if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 }
+    data.func_max_blocks_per_sm =
+        hip::device_prop().maxThreadsPerMultiProcessor / 1024;
+    if (data.func_max_blocks_per_sm <= 0)
+    {
+      data.func_max_blocks_per_sm = 1;
+    }
 #endif
-
   }
 
   return data;
 }
 
 //! Get the maximum occupancy of a kernel with runtime threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block, int func_threads_per_block)
+template <typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE HipOccMaxBlocksData
+hip_occupancy_max_blocks(const void* func,
+                         size_t func_dynamic_shmem_per_block,
+                         int func_threads_per_block)
 {
   static thread_local HipOccMaxBlocksData data;
 
-  if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
-       data.func_threads_per_block != func_threads_per_block ) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
+      data.func_threads_per_block != func_threads_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
-    data.func_threads_per_block = func_threads_per_block;
+    data.func_threads_per_block       = func_threads_per_block;
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
     hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
+        &data.func_max_blocks_per_sm, func, func_threads_per_block,
+        func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
-    data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024;
-    if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 }
+    data.func_max_blocks_per_sm =
+        hip::device_prop().maxThreadsPerMultiProcessor / 1024;
+    if (data.func_max_blocks_per_sm <= 0)
+    {
+      data.func_max_blocks_per_sm = 1;
+    }
 #endif
-
   }
 
   return data;
@@ -512,14 +555,16 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
  *
  ******************************************************************************
  */
-template < typename IdxT, typename Concretizer, typename UniqueMarker>
+template <typename IdxT, typename Concretizer, typename UniqueMarker>
 struct ConcretizerImpl
 {
-  ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len)
-    : m_func(func)
-    , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block)
-    , m_len(len)
-  { }
+  ConcretizerImpl(const void* func,
+                  size_t func_dynamic_shmem_per_block,
+                  IdxT len)
+      : m_func(func),
+        m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
+        m_len(len)
+  {}
 
   IdxT get_max_block_size() const
   {
@@ -533,10 +578,14 @@ struct ConcretizerImpl
   IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
-    if (func_threads_per_block <= func_max_threads_per_block) {
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    if (func_threads_per_block <= func_max_threads_per_block)
+    {
       return func_threads_per_block;
-    } else {
+    }
+    else
+    {
       return IdxT(0);
     }
   }
@@ -544,7 +593,8 @@ struct ConcretizerImpl
   //! Get a grid size when block size is specified
   IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const
   {
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return func_blocks_per_device;
   }
 
@@ -552,16 +602,17 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_len() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
   //! Get a block size when grid size is specified
   IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
     return std::min(func_threads_per_block, func_max_threads_per_block);
   }
 
@@ -570,8 +621,10 @@ struct ConcretizerImpl
   {
     auto data = hip_occupancy_max_blocks<UniqueMarker>(
         m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
-    IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size<IdxT>(data);
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_max_blocks_per_device =
+        Concretizer::template get_max_grid_size<IdxT>(data);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return std::min(func_blocks_per_device, func_max_blocks_per_device);
   }
 
@@ -579,9 +632,9 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_device() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        this->get_grid_size_to_fit_device(func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
 private:
diff --git a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
index 975d26b7ff..5d2e9b69bb 100644
--- a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
@@ -41,9 +41,9 @@ namespace hip
 
 // global function that creates the value on the device using the
 // factory and writes it into a pinned ptr
-template < typename Factory >
-__global__ void get_value_global(
-    typename Factory::value_type* ptr, Factory factory)
+template <typename Factory>
+__global__ void get_value_global(typename Factory::value_type* ptr,
+                                 Factory factory)
 {
   *ptr = factory();
 }
@@ -52,8 +52,9 @@ __global__ void get_value_global(
 inline void* get_cached_value_ptr(size_t nbytes)
 {
   static size_t cached_nbytes = 0;
-  static void* ptr = nullptr;
-  if (nbytes > cached_nbytes) {
+  static void* ptr            = nullptr;
+  if (nbytes > cached_nbytes)
+  {
     cached_nbytes = 0;
     hipErrchk(hipHostFree(ptr));
     hipErrchk(hipHostMalloc(&ptr, nbytes));
@@ -73,7 +74,7 @@ inline std::mutex& get_value_mutex()
 // get the device function pointer by calling a global function to
 // write it into a pinned ptr, beware different instantiates of this
 // function may run concurrently
-template < typename Factory >
+template <typename Factory>
 inline auto get_value(Factory&& factory)
 {
   using value_type = typename std::decay_t<Factory>::value_type;
@@ -81,8 +82,9 @@ inline auto get_value(Factory&& factory)
 
   auto res = ::camp::resources::Hip::get_default();
   auto ptr = static_cast<value_type*>(get_cached_value_ptr(sizeof(value_type)));
-  auto func = reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
-  void *args[] = {(void*)&ptr, (void*)&factory};
+  auto func =
+      reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
+  void* args[] = {(void*)&ptr, (void*)&factory};
   hipErrchk(hipLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
   hipErrchk(hipStreamSynchronize(res.get_stream()));
 
@@ -91,7 +93,7 @@ inline auto get_value(Factory&& factory)
 
 // get the device function pointer and store it so it can be used
 // multiple times
-template < typename Factory >
+template <typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -101,17 +103,15 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace hip
 
 /*!
-* Populate and return a Dispatcher object that can be used in device code
-*/
-template < typename T, typename Dispatcher_T, size_t BLOCK_SIZE, bool Async >
+ * Populate and return a Dispatcher object that can be used in device code
+ */
+template <typename T, typename Dispatcher_T, size_t BLOCK_SIZE, bool Async>
 inline const Dispatcher_T* get_Dispatcher(hip_work<BLOCK_SIZE, Async> const&)
 {
-  static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return hip::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+  static Dispatcher_T dispatcher {Dispatcher_T::template makeDispatcher<T>(
+      [](auto&& factory) {
+        return hip::get_cached_value(std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
index 26d45d7bd9..dbdcbc7851 100644
--- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
@@ -36,46 +36,45 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          typename... Args>
+struct WorkRunner<RAJA::hip_work<BLOCK_SIZE, Async>,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::hip_exec_async<BLOCK_SIZE>,
+                              RAJA::hip_work<BLOCK_SIZE, Async>,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
 {
-  using base = WorkRunnerForallOrdered<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+  using base = WorkRunnerForallOrdered<RAJA::hip_exec_async<BLOCK_SIZE>,
+                                       RAJA::hip_work<BLOCK_SIZE, Async>,
+                                       RAJA::ordered,
+                                       DISPATCH_POLICY_T,
+                                       ALLOCATOR_T,
+                                       INDEX_T,
+                                       Args...>;
   using base::base;
-  using IndexType = INDEX_T;
+  using IndexType       = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
 
   ///
   /// run the loops in the given work container in order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -83,8 +82,12 @@ struct WorkRunner<
     IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
 
     // Only synchronize if we had something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::hip::synchronize(r); }
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
+      if (!Async)
+      {
+        RAJA::hip::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -95,46 +98,45 @@ struct WorkRunner<
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          typename... Args>
+struct WorkRunner<RAJA::hip_work<BLOCK_SIZE, Async>,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::hip_exec_async<BLOCK_SIZE>,
+                              RAJA::hip_work<BLOCK_SIZE, Async>,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
 {
-  using base = WorkRunnerForallReverse<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+  using base = WorkRunnerForallReverse<RAJA::hip_exec_async<BLOCK_SIZE>,
+                                       RAJA::hip_work<BLOCK_SIZE, Async>,
+                                       RAJA::reverse_ordered,
+                                       DISPATCH_POLICY_T,
+                                       ALLOCATOR_T,
+                                       INDEX_T,
+                                       Args...>;
   using base::base;
-  using IndexType = INDEX_T;
+  using IndexType       = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
 
   ///
   /// run the loops in the given work container in reverse order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template <typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -142,8 +144,12 @@ struct WorkRunner<
     IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
 
     // Only synchronize if we had something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::hip::synchronize(r); }
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
+      if (!Async)
+      {
+        RAJA::hip::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -155,15 +161,17 @@ struct WorkRunner<
  * A body and segment holder for storing loops that will be executed
  * on the device
  */
-template <typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
+template <typename Segment_type,
+          typename LoopBody,
+          typename index_type,
+          typename... Args>
 struct HoldHipDeviceXThreadblockLoop
 {
-  template < typename segment_in, typename body_in >
+  template <typename segment_in, typename body_in>
   HoldHipDeviceXThreadblockLoop(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {}
 
   RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
   {
@@ -171,10 +179,11 @@ struct HoldHipDeviceXThreadblockLoop
     // TODO:: decide whether or not to privatize the loop body
     const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
     const index_type stride  = blockDim.x * gridDim.x;
-    const auto begin = m_segment.begin();
-    const auto end   = m_segment.end();
+    const auto begin         = m_segment.begin();
+    const auto end           = m_segment.end();
     const index_type len(end - begin);
-    for ( index_type i = i_begin; i < len; i += stride ) {
+    for (index_type i = i_begin; i < len; i += stride)
+    {
       m_body(begin[i], std::forward<Args>(args)...);
     }
   }
@@ -184,11 +193,11 @@ struct HoldHipDeviceXThreadblockLoop
   LoopBody m_body;
 };
 
-template < size_t BLOCK_SIZE,
-           typename StorageIter,
-           typename value_type,
-           typename index_type,
-           typename ... Args >
+template <size_t BLOCK_SIZE,
+          typename StorageIter,
+          typename value_type,
+          typename index_type,
+          typename... Args>
 __launch_bounds__(BLOCK_SIZE, 1) __global__
     void hip_unordered_y_block_global(StorageIter iter, Args... args)
 {
@@ -205,36 +214,40 @@ __launch_bounds__(BLOCK_SIZE, 1) __global__
  * the x direction, with the number of threads in the x dimension determined
  * by the average number of iterates per loop
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    DISPATCH_POLICY_T,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>
 {
   using exec_policy = RAJA::hip_work<BLOCK_SIZE, Async>;
-  using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
+  using order_policy =
+      RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using Allocator = ALLOCATOR_T;
-  using index_type = INDEX_T;
-  using resource_type = resources::Hip;
+  using Allocator       = ALLOCATOR_T;
+  using index_type      = INDEX_T;
+  using resource_type   = resources::Hip;
 
   // The type that will hold the segment and loop body in work storage
-  struct holder_type {
-    template < typename T >
+  struct holder_type
+  {
+    template <typename T>
     using type = HoldHipDeviceXThreadblockLoop<
-        typename camp::at<T, camp::num<0>>::type, // ITERABLE
-        typename camp::at<T, camp::num<1>>::type, // LOOP_BODY
-        index_type, Args...>;
+        typename camp::at<T, camp::num<0>>::type,  // ITERABLE
+        typename camp::at<T, camp::num<1>>::type,  // LOOP_BODY
+        index_type,
+        Args...>;
   };
   ///
-  template < typename T >
+  template <typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -243,21 +256,24 @@ struct WorkRunner<
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::hip, dispatcher_holder_policy, RAJA::hip_work<BLOCK_SIZE, true>, Args...>;
+  using dispatcher_type = Dispatcher<Platform::hip,
+                                     dispatcher_holder_policy,
+                                     RAJA::hip_work<BLOCK_SIZE, true>,
+                                     Args...>;
 
   WorkRunner() = default;
 
-  WorkRunner(WorkRunner const&) = delete;
+  WorkRunner(WorkRunner const&)            = delete;
   WorkRunner& operator=(WorkRunner const&) = delete;
 
-  WorkRunner(WorkRunner && o)
-    : m_total_iterations(o.m_total_iterations)
+  WorkRunner(WorkRunner&& o) : m_total_iterations(o.m_total_iterations)
   {
     o.m_total_iterations = 0;
   }
-  WorkRunner& operator=(WorkRunner && o)
+  WorkRunner& operator=(WorkRunner&& o)
   {
     m_total_iterations = o.m_total_iterations;
 
@@ -267,35 +283,41 @@ struct WorkRunner<
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename Iterable, typename LoopBody >
-  inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body)
+  template <typename WorkContainer, typename Iterable, typename LoopBody>
+  inline void
+  enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body)
   {
     using Iterator  = camp::decay<decltype(std::begin(iter))>;
     using LOOP_BODY = camp::decay<LoopBody>;
     using ITERABLE  = camp::decay<Iterable>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+    using IndexType =
+        camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
     using holder = holder_type_t<camp::list<ITERABLE, LOOP_BODY>>;
 
-    // using true_value_type = typename WorkContainer::template true_value_type<holder>;
+    // using true_value_type = typename WorkContainer::template
+    // true_value_type<holder>;
 
     Iterator begin = std::begin(iter);
-    Iterator end = std::end(iter);
-    IndexType len = std::distance(begin, end);
+    Iterator end   = std::end(iter);
+    IndexType len  = std::distance(begin, end);
 
     // Only launch kernel if we have something to iterate over
-    if (len > 0 && BLOCK_SIZE > 0) {
+    if (len > 0 && BLOCK_SIZE > 0)
+    {
 
       m_total_iterations += len;
 
       //
-      // TODO: Privatize the loop_body, using make_launch_body to setup reductions
+      // TODO: Privatize the loop_body, using make_launch_body to setup
+      // reductions
       //
       // LOOP_BODY body = RAJA::hip::make_launch_body(func,
-      //     gridSize, blockSize, shmem, stream, std::forward<LoopBody>(loop_body));
+      //     gridSize, blockSize, shmem, stream,
+      //     std::forward<LoopBody>(loop_body));
 
       storage.template emplace<holder>(
-          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
+          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy {}),
           std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
     }
   }
@@ -303,37 +325,43 @@ struct WorkRunner<
   // no extra storage required here
   using per_run_storage = int;
 
-  template < typename WorkContainer >
-  per_run_storage run(WorkContainer const& storage, resource_type r, Args... args) const
+  template <typename WorkContainer>
+  per_run_storage
+  run(WorkContainer const& storage, resource_type r, Args... args) const
   {
-    using Iterator  = camp::decay<decltype(std::begin(storage))>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(storage), std::end(storage)))>;
+    using Iterator   = camp::decay<decltype(std::begin(storage))>;
+    using IndexType  = camp::decay<decltype(std::distance(std::begin(storage),
+                                                          std::end(storage)))>;
     using value_type = typename WorkContainer::value_type;
 
-    per_run_storage run_storage{};
+    per_run_storage run_storage {};
 
-    auto func = hip_unordered_y_block_global<BLOCK_SIZE, Iterator, value_type, index_type, Args...>;
+    auto func = hip_unordered_y_block_global<BLOCK_SIZE, Iterator, value_type,
+                                             index_type, Args...>;
 
     //
     // Compute the requested iteration space size
     //
-    Iterator begin = std::begin(storage);
-    Iterator end = std::end(storage);
+    Iterator begin      = std::begin(storage);
+    Iterator end        = std::end(storage);
     IndexType num_loops = std::distance(begin, end);
 
     // Only launch kernel if we have something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
 
-      index_type average_iterations = m_total_iterations / static_cast<index_type>(num_loops);
+      index_type average_iterations =
+          m_total_iterations / static_cast<index_type>(num_loops);
 
       //
       // Compute the number of blocks
       //
       constexpr index_type block_size = static_cast<index_type>(BLOCK_SIZE);
-      hip_dim_t blockSize{static_cast<hip_dim_member_t>(block_size), 1, 1};
-      hip_dim_t gridSize{static_cast<hip_dim_member_t>((average_iterations + block_size - 1) / block_size),
-                          static_cast<hip_dim_member_t>(num_loops),
-                          1};
+      hip_dim_t blockSize {static_cast<hip_dim_member_t>(block_size), 1, 1};
+      hip_dim_t gridSize {
+          static_cast<hip_dim_member_t>((average_iterations + block_size - 1) /
+                                        block_size),
+          static_cast<hip_dim_member_t>(num_loops), 1};
 
       RAJA_FT_BEGIN;
 
@@ -346,8 +374,9 @@ struct WorkRunner<
         //
         // Launch the kernel
         //
-        void* func_args[] = { (void*)&begin, (void*)&args... };
-        RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async);
+        void* func_args[] = {(void*)&begin, (void*)&args...};
+        RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args,
+                          shmem, r, Async);
       }
 
       RAJA_FT_END;
@@ -357,10 +386,7 @@ struct WorkRunner<
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  {
-    m_total_iterations = 0;
-  }
+  void clear() { m_total_iterations = 0; }
 
 private:
   index_type m_total_iterations = 0;
@@ -369,29 +395,31 @@ struct WorkRunner<
 #if !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported runner types incomplete
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        RAJA::indirect_function_call_dispatch,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    RAJA::indirect_function_call_dispatch,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>;
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
+          typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        RAJA::indirect_virtual_function_dispatch,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    RAJA::indirect_virtual_function_dispatch,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>;
 
 #endif
 
diff --git a/include/RAJA/policy/hip/atomic.hpp b/include/RAJA/policy/hip/atomic.hpp
index b4f0d7faa7..60b0871f0d 100644
--- a/include/RAJA/policy/hip/atomic.hpp
+++ b/include/RAJA/policy/hip/atomic.hpp
@@ -49,11 +49,8 @@ namespace RAJA
 namespace detail
 {
 
-using hip_atomicCommon_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long
->;
+using hip_atomicCommon_builtin_types =
+    ::camp::list<int, unsigned int, unsigned long long>;
 
 /*!
  * Type trait for determining if atomic operators should be implemented
@@ -62,11 +59,11 @@ using hip_atomicCommon_builtin_types = ::camp::list<
  * hip_useBuiltinExchange below.
  */
 template <typename T>
-struct hip_useBuiltinCommon {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+struct hip_useBuiltinCommon
+{
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value;
 };
 
 
@@ -78,15 +75,15 @@ struct hip_useBuiltinCommon {
  * below.
  */
 template <typename T>
-struct hip_useReinterpretCommon {
-  static constexpr bool value =
-    !hip_useBuiltinCommon<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+struct hip_useReinterpretCommon
+{
+  static constexpr bool value = !hip_useBuiltinCommon<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 
@@ -106,7 +103,7 @@ using hip_useReinterpretCommon_t = typename hip_useReinterpretCommon<T>::type;
  */
 template <typename T,
           std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
+RAJA_INLINE __device__ T hip_atomicOr(T* acc, T value)
 {
   return ::atomicOr(acc, value);
 }
@@ -117,12 +114,12 @@ RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
  * using a builtin
  */
 template <typename T>
-struct hip_useBuiltinExchange {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value ||
-    std::is_same<T, float>::value;
+struct hip_useBuiltinExchange
+{
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value ||
+                                std::is_same<T, float>::value;
 };
 
 /*!
@@ -130,22 +127,23 @@ struct hip_useBuiltinExchange {
  * by reinterpreting inputs to types that the builtin exchange supports
  */
 template <typename T>
-struct hip_useReinterpretExchange {
-  static constexpr bool value =
-    !hip_useBuiltinExchange<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+struct hip_useReinterpretExchange
+{
+  static constexpr bool value = !hip_useBuiltinExchange<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
 template <typename T>
-using hip_useReinterpretExchange_t = typename hip_useReinterpretExchange<T>::type;
+using hip_useReinterpretExchange_t =
+    typename hip_useReinterpretExchange<T>::type;
 
 /*!
  * Performs an atomic exchange using a builtin function. Stores the new value
@@ -153,7 +151,7 @@ using hip_useReinterpretExchange_t = typename hip_useReinterpretExchange<T>::typ
  */
 template <typename T,
           std::enable_if_t<hip_useBuiltinExchange<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
+RAJA_INLINE __device__ T hip_atomicExchange(T* acc, T value)
 {
   return ::atomicExch(acc, value);
 }
@@ -164,17 +162,16 @@ RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<hip_useReinterpretExchange<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
+RAJA_INLINE __device__ T hip_atomicExchange(T* acc, T value)
 {
   using R = hip_useReinterpretExchange_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicExchange(reinterpret_cast<R*>(acc),
-                       RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(hip_atomicExchange(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
-#if defined(__has_builtin) && \
+#if defined(__has_builtin) &&                                                  \
     (__has_builtin(__hip_atomic_load) || __has_builtin(__hip_atomic_store))
 
 /*!
@@ -182,10 +179,11 @@ RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
  * using an intrinsic
  */
 template <typename T>
-struct hip_useBuiltinLoad {
+struct hip_useBuiltinLoad
+{
   static constexpr bool value =
-    (std::is_integral<T>::value || std::is_enum<T>::value) &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      (std::is_integral<T>::value || std::is_enum<T>::value) &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
 template <typename T>
@@ -197,54 +195,54 @@ using hip_useBuiltinStore = hip_useBuiltinLoad<T>;
  * by reinterpreting inputs to types that intrinsics support
  */
 template <typename T>
-struct hip_useReinterpretLoad {
-  static constexpr bool value =
-    !std::is_integral<T>::value &&
-    !std::is_enum<T>::value &&
-    ((sizeof(T) == 1
+struct hip_useReinterpretLoad
+{
+  static constexpr bool value = !std::is_integral<T>::value &&
+                                !std::is_enum<T>::value &&
+                                ((sizeof(T) == 1
 #if !defined(UINT8_MAX)
-      && sizeof(unsigned char) == 1
+                                  && sizeof(unsigned char) == 1
 #endif
-     ) ||
-     (sizeof(T) == 2
+                                  ) ||
+                                 (sizeof(T) == 2
 #if !defined(UINT16_MAX)
-      && sizeof(unsigned short) == 2
+                                  && sizeof(unsigned short) == 2
 #endif
-     ) ||
-     (sizeof(T) == 4
+                                  ) ||
+                                 (sizeof(T) == 4
 #if !defined(UINT32_MAX)
-      && sizeof(unsigned int) == 4
+                                  && sizeof(unsigned int) == 4
 #endif
-     ) ||
-     (sizeof(T) == 8
+                                  ) ||
+                                 (sizeof(T) == 8
 #if !defined(UINT64_MAX)
-      && sizeof(unsigned long long) == 8
+                                  && sizeof(unsigned long long) == 8
 #endif
-     ));
+                                  ));
 
   using type =
-    std::conditional_t<sizeof(T) == 1,
+      std::conditional_t<sizeof(T) == 1,
 #if defined(UINT8_MAX)
-                       uint8_t,
+                         uint8_t,
 #else
-                       unsigned char,
+                         unsigned char,
 #endif
-    std::conditional_t<sizeof(T) == 2,
+                         std::conditional_t<sizeof(T) == 2,
 #if defined(UINT16_MAX)
-                       uint16_t,
+                                            uint16_t,
 #else
-                       unsigned short,
+                                            unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == 4,
+                                            std::conditional_t<sizeof(T) == 4,
 #if defined(UINT32_MAX)
-                       uint32_t,
+                                                               uint32_t,
 #else
-                       unsigned int,
+                                                               unsigned int,
 #endif
 #if defined(UINT64_MAX)
-                       uint64_t>>>;
+                                                               uint64_t>>>;
 #else
-                       unsigned long long>>>;
+                                                               unsigned long long>>>;
 #endif
 };
 
@@ -285,7 +283,7 @@ using hip_useReinterpretStore_t = typename hip_useReinterpretStore<T>::type;
  */
 template <typename T,
           std::enable_if_t<hip_useBuiltinLoad<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicLoad(T *acc)
+RAJA_INLINE __device__ T hip_atomicLoad(T* acc)
 {
 #if defined(__has_builtin) && __has_builtin(__hip_atomic_load)
   return __hip_atomic_load(acc, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
@@ -296,12 +294,12 @@ RAJA_INLINE __device__ T hip_atomicLoad(T *acc)
 
 template <typename T,
           std::enable_if_t<hip_useReinterpretLoad<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicLoad(T *acc)
+RAJA_INLINE __device__ T hip_atomicLoad(T* acc)
 {
   using R = hip_useReinterpretLoad_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicLoad(reinterpret_cast<R*>(acc)));
+      hip_atomicLoad(reinterpret_cast<R*>(acc)));
 }
 
 
@@ -310,7 +308,7 @@ RAJA_INLINE __device__ T hip_atomicLoad(T *acc)
  */
 template <typename T,
           std::enable_if_t<hip_useBuiltinStore<T>::value, bool> = true>
-RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
+RAJA_INLINE __device__ void hip_atomicStore(T* acc, T value)
 {
 #if defined(__has_builtin) && __has_builtin(__hip_atomic_store)
   __hip_atomic_store(acc, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
@@ -321,7 +319,7 @@ RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
 
 template <typename T,
           std::enable_if_t<hip_useReinterpretStore<T>::value, bool> = true>
-RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
+RAJA_INLINE __device__ void hip_atomicStore(T* acc, T value)
 {
   using R = hip_useReinterpretStore_t<T>;
 
@@ -337,7 +335,7 @@ RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value)
+RAJA_INLINE __device__ T hip_atomicCAS(T* acc, T compare, T value)
 {
   return ::atomicCAS(acc, compare, value);
 }
@@ -349,14 +347,13 @@ RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value)
  */
 template <typename T,
           std::enable_if_t<hip_useReinterpretCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value)
+RAJA_INLINE __device__ T hip_atomicCAS(T* acc, T compare, T value)
 {
   using R = hip_useReinterpretCommon_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicCAS(reinterpret_cast<R*>(acc),
-                  RAJA::util::reinterp_A_as_B<T, R>(compare),
-                  RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(hip_atomicCAS(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(compare),
+      RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
@@ -390,15 +387,15 @@ RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b)
  * operation.
  */
 template <typename T, typename Oper>
-RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
-                                            Oper&& oper)
+RAJA_INLINE __device__ T hip_atomicCAS_loop(T* acc, Oper&& oper)
 {
   T old = hip_atomicLoad(acc);
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = hip_atomicCAS(acc, expected, oper(expected));
+    old      = hip_atomicCAS(acc, expected, oper(expected));
   } while (!hip_atomicCAS_equal(old, expected));
 
   return old;
@@ -406,27 +403,29 @@ RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
 
 
 /*!
- * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting.
- * Implementation uses the existing HIP supplied unsigned 32-bit or 64-bit CAS
- * operator. Returns the OLD value that was replaced by the result of this
- * operation.
+ * Generic implementation of any atomic 32-bit or 64-bit operator with
+ * short-circuiting. Implementation uses the existing HIP supplied unsigned
+ * 32-bit or 64-bit CAS operator. Returns the OLD value that was replaced by the
+ * result of this operation.
  */
 template <typename T, typename Oper, typename ShortCircuit>
-RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
+RAJA_INLINE __device__ T hip_atomicCAS_loop(T* acc,
                                             Oper&& oper,
                                             ShortCircuit&& sc)
 {
   T old = hip_atomicLoad(acc);
 
-  if (sc(old)) {
+  if (sc(old))
+  {
     return old;
   }
 
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = hip_atomicCAS(acc, expected, oper(expected));
+    old      = hip_atomicCAS(acc, expected, oper(expected));
   } while (!hip_atomicCAS_equal(old, expected) && !sc(old));
 
   return old;
@@ -440,29 +439,28 @@ RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
 /*!
  * List of types where HIP builtin atomics are used to implement atomicAdd.
  */
-using hip_atomicAdd_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long,
-  float
+using hip_atomicAdd_builtin_types = ::camp::list<int,
+                                                 unsigned int,
+                                                 unsigned long long,
+                                                 float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                 ,
+                                                 double
 #endif
->;
+                                                 >;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicAdd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicAdd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicAdd(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old + value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old + value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicAdd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicAdd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicAdd(T* acc, T value)
 {
   return ::atomicAdd(acc, value);
 }
@@ -475,16 +473,15 @@ RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
 /*!
  * List of types where HIP builtin atomics are used to implement atomicSub.
  */
-using hip_atomicSub_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long,
-  float
+using hip_atomicSub_builtin_types = ::camp::list<int,
+                                                 unsigned int,
+                                                 unsigned long long,
+                                                 float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                 ,
+                                                 double
 #endif
->;
+                                                 >;
 
 /*!
  * List of types where HIP builtin atomicSub is used to implement atomicSub.
@@ -492,10 +489,7 @@ using hip_atomicSub_builtin_types = ::camp::list<
  * Avoid multiple definition errors by including the previous list type here
  * to ensure these lists have different types.
  */
-using hip_atomicSub_via_Sub_builtin_types = ::camp::list<
-  int,
-  unsigned int
->;
+using hip_atomicSub_via_Sub_builtin_types = ::camp::list<int, unsigned int>;
 
 /*!
  * List of types where HIP builtin atomicAdd is used to implement atomicSub.
@@ -503,33 +497,33 @@ using hip_atomicSub_via_Sub_builtin_types = ::camp::list<
  * Avoid multiple definition errors by including the previous list type here
  * to ensure these lists have different types.
  */
-using hip_atomicSub_via_Add_builtin_types = ::camp::list<
-  unsigned long long,
-  float
+using hip_atomicSub_via_Add_builtin_types = ::camp::list<unsigned long long,
+                                                         float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                         ,
+                                                         double
 #endif
->;
+                                                         >;
 
 /*!
  * HIP atomicSub compare and swap loop implementation.
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicSub_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicSub_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicSub(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old - value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old - value; });
 }
 
 /*!
  * HIP atomicSub builtin implementation.
  */
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Sub_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Sub_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T hip_atomicSub(T* acc, T value)
 {
   return ::atomicSub(acc, value);
 }
@@ -537,9 +531,11 @@ RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
 /*!
  * HIP atomicSub via atomicAdd builtin implementation.
  */
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Add_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Add_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T hip_atomicSub(T* acc, T value)
 {
   return ::atomicAdd(acc, -value);
 }
@@ -550,23 +546,20 @@ RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
  */
 using hip_atomicMin_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicMin_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicMin_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicMin(T* acc, T value)
 {
   return hip_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc, [value](T old) { return value < old ? value : old; },
+      [value](T current) { return current <= value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicMin_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicMin_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicMin(T* acc, T value)
 {
   return ::atomicMin(acc, value);
 }
@@ -577,23 +570,20 @@ RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value)
  */
 using hip_atomicMax_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicMax_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicMax(T* acc, T value)
 {
   return hip_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc, [value](T old) { return old < value ? value : old; },
+      [value](T current) { return value <= current; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicMax_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicMax(T* acc, T value)
 {
   return ::atomicMax(acc, value);
 }
@@ -603,11 +593,11 @@ RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value)
  * Atomic increment with reset
  */
 template <typename T>
-RAJA_INLINE __device__ T hip_atomicInc(T *acc, T value)
+RAJA_INLINE __device__ T hip_atomicInc(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
-  });
+  return hip_atomicCAS_loop(
+      acc, [value](T old)
+      { return value <= old ? static_cast<T>(0) : old + static_cast<T>(1); });
 }
 
 
@@ -615,7 +605,7 @@ RAJA_INLINE __device__ T hip_atomicInc(T *acc, T value)
  * Atomic increment (implemented in terms of atomic addition)
  */
 template <typename T>
-RAJA_INLINE __device__ T hip_atomicInc(T *acc)
+RAJA_INLINE __device__ T hip_atomicInc(T* acc)
 {
   return hip_atomicAdd(acc, static_cast<T>(1));
 }
@@ -625,11 +615,15 @@ RAJA_INLINE __device__ T hip_atomicInc(T *acc)
  * Atomic decrement with reset
  */
 template <typename T>
-RAJA_INLINE __device__ T hip_atomicDec(T *acc, T value)
+RAJA_INLINE __device__ T hip_atomicDec(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
-  });
+  return hip_atomicCAS_loop(acc,
+                            [value](T old)
+                            {
+                              return old == static_cast<T>(0) || value < old
+                                         ? value
+                                         : old - static_cast<T>(1);
+                            });
 }
 
 
@@ -637,7 +631,7 @@ RAJA_INLINE __device__ T hip_atomicDec(T *acc, T value)
  * Atomic decrement (implemented in terms of atomic subtraction)
  */
 template <typename T>
-RAJA_INLINE __device__ T hip_atomicDec(T *acc)
+RAJA_INLINE __device__ T hip_atomicDec(T* acc)
 {
   return hip_atomicSub(acc, static_cast<T>(1));
 }
@@ -648,18 +642,18 @@ RAJA_INLINE __device__ T hip_atomicDec(T *acc)
  */
 using hip_atomicAnd_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicAnd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicAnd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicAnd(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old & value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old & value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicAnd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicAnd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicAnd(T* acc, T value)
 {
   return ::atomicAnd(acc, value);
 }
@@ -670,13 +664,12 @@ RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value)
  */
 using hip_atomicOr_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicOr_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicOr_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicOr(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old | value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old | value; });
 }
 
 /*!
@@ -690,18 +683,18 @@ RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
  */
 using hip_atomicXor_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicXor_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicXor_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicXor(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old ^ value;
-  });
+  return hip_atomicCAS_loop(acc, [value](T old) { return old ^ value; });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicXor_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
+template <
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicXor_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicXor(T* acc, T value)
 {
   return ::atomicXor(acc, value);
 }
@@ -721,181 +714,191 @@ RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicLoad(hip_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(hip_atomic_explicit<host_policy>,
+                                          T* acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicLoad(acc);
 #else
-  return RAJA::atomicLoad(host_policy{}, acc);
+  return RAJA::atomicLoad(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
 RAJA_INLINE RAJA_HOST_DEVICE void
-atomicStore(hip_atomic_explicit<host_policy>, T *acc, T value)
+atomicStore(hip_atomic_explicit<host_policy>, T* acc, T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   detail::hip_atomicStore(acc, value);
 #else
-  RAJA::atomicStore(host_policy{}, acc, value);
+  RAJA::atomicStore(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAdd(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicAdd(acc, value);
 #else
-  return RAJA::atomicAdd(host_policy{}, acc, value);
+  return RAJA::atomicAdd(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicSub(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicSub(acc, value);
 #else
-  return RAJA::atomicSub(host_policy{}, acc, value);
+  return RAJA::atomicSub(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMin(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicMin(acc, value);
 #else
-  return RAJA::atomicMin(host_policy{}, acc, value);
+  return RAJA::atomicMin(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMax(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicMax(acc, value);
 #else
-  return RAJA::atomicMax(host_policy{}, acc, value);
+  return RAJA::atomicMax(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicInc(acc, value);
 #else
-  return RAJA::atomicInc(host_policy{}, acc, value);
+  return RAJA::atomicInc(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(hip_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(hip_atomic_explicit<host_policy>,
+                                         T* acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicInc(acc);
 #else
-  return RAJA::atomicInc(host_policy{}, acc);
+  return RAJA::atomicInc(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicDec(acc, value);
 #else
-  return RAJA::atomicDec(host_policy{}, acc, value);
+  return RAJA::atomicDec(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(hip_atomic_explicit<host_policy>, T *acc)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(hip_atomic_explicit<host_policy>,
+                                         T* acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicDec(acc);
 #else
-  return RAJA::atomicDec(host_policy{}, acc);
+  return RAJA::atomicDec(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAnd(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicAnd(acc, value);
 #else
-  return RAJA::atomicAnd(host_policy{}, acc, value);
+  return RAJA::atomicAnd(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicOr(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(hip_atomic_explicit<host_policy>,
+                                        T* acc,
+                                        T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicOr(acc, value);
 #else
-  return RAJA::atomicOr(host_policy{}, acc, value);
+  return RAJA::atomicOr(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicXor(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicXor(acc, value);
 #else
-  return RAJA::atomicXor(host_policy{}, acc, value);
+  return RAJA::atomicXor(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicExchange(hip_atomic_explicit<host_policy>, T *acc, T value)
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(hip_atomic_explicit<host_policy>,
+                                              T* acc,
+                                              T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicExchange(acc, value);
 #else
-  return RAJA::atomicExchange(host_policy{}, acc, value);
+  return RAJA::atomicExchange(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T, typename host_policy>
 RAJA_INLINE RAJA_HOST_DEVICE T
-atomicCAS(hip_atomic_explicit<host_policy>, T *acc, T compare, T value)
+atomicCAS(hip_atomic_explicit<host_policy>, T* acc, T compare, T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicCAS(acc, compare, value);
 #else
-  return RAJA::atomicCAS(host_policy{}, acc, compare, value);
+  return RAJA::atomicCAS(host_policy {}, acc, compare, value);
 #endif
 }
 
diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp
index a8c4cf53b9..14b8b5abf9 100644
--- a/include/RAJA/policy/hip/forall.hpp
+++ b/include/RAJA/policy/hip/forall.hpp
@@ -71,61 +71,91 @@ namespace impl
  *
  ******************************************************************************
  */
-template<typename IterationMapping, typename IterationGetter, typename Concretizer, typename UniqueMarker>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          typename UniqueMarker>
 struct ForallDimensionCalculator;
 
 // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0
 // there are specializations for named_usage::unspecified
 // but named_usage::ignored is not supported so no specializations are provided
 // and static_asserts in the general case catch unsupported values
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
 
   using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
 
-    if ( len > (block_size * grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (block_size * grid_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
-    internal::set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexGetter::block_size));
-    internal::set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexGetter::grid_size));
+    internal::set_hip_dim<dim>(dims.threads,
+                               static_cast<IdxT>(IndexGetter::block_size));
+    internal::set_hip_dim<dim>(dims.blocks,
+                               static_cast<IdxT>(IndexGetter::grid_size));
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size);
 
-    if ( block_size == IdxT(0) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (block_size == IdxT(0))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
@@ -133,43 +163,59 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size);
+    const IdxT grid_size  = concretizer.get_grid_size_to_fit_len(block_size);
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
     internal::set_hip_dim<dim>(dims.blocks, grid_size);
   }
 };
 
-template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim, typename Concretizer, typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  using IndexGetter = ::RAJA::hip::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_len();
 
@@ -178,46 +224,67 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
 
   using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT RAJA_UNUSED_ARG(len),
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT RAJA_UNUSED_ARG(len),
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
     internal::set_hip_dim<dim>(dims.blocks, grid_size);
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int GRID_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size);
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
@@ -225,43 +292,59 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          typename Concretizer,
+          typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size);
+    const IdxT grid_size  = concretizer.get_grid_size_to_fit_device(block_size);
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
     internal::set_hip_dim<dim>(dims.blocks, grid_size);
   }
 };
 
-template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template <named_dim dim, typename Concretizer, typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  using IndexGetter = ::RAJA::hip::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template <typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_device();
 
@@ -290,21 +373,22 @@ template <typename EXEC_POL,
           typename LOOP_BODY,
           typename IndexType,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+          typename IterationGetter  = typename EXEC_POL::IterationGetter,
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size > 0),
+                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_hip_kernel(LOOP_BODY loop_body,
+                           const Iterator idx,
+                           IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if (ii < length) {
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
     body(idx[ii]);
   }
 }
@@ -314,21 +398,20 @@ template <typename EXEC_POL,
           typename LOOP_BODY,
           typename IndexType,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+          typename IterationGetter  = typename EXEC_POL::IterationGetter,
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size <= 0),
+                           size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void
+forall_hip_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if (ii < length) {
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
     body(idx[ii]);
   }
 }
@@ -339,23 +422,24 @@ template <typename EXEC_POL,
           typename IndexType,
           typename ForallParam,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+          typename IterationGetter  = typename EXEC_POL::IterationGetter,
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size > 0),
+                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_hip_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length,
+                            ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -366,134 +450,139 @@ template <typename EXEC_POL,
           typename IndexType,
           typename ForallParam,
           typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+          typename IterationGetter  = typename EXEC_POL::IterationGetter,
+          std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                           IterationMapping>::value &&
+                               (IterationGetter::block_size <= 0),
+                           size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_hip_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length,
+                                   ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template <
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping          = typename EXEC_POL::IterationMapping,
+    typename IterationGetter           = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_hip_kernel(LOOP_BODY loop_body,
+                           const Iterator idx,
+                           IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
     body(idx[ii]);
   }
 }
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+template <
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter  = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void
+forall_hip_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
     body(idx[ii]);
   }
 }
 
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template <
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping          = typename EXEC_POL::IterationMapping,
+    typename IterationGetter           = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_hip_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length,
+                            ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+template <
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter  = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_hip_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length,
+                                   ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -508,37 +597,48 @@ void forallp_hip_kernel(LOOP_BODY loop_body,
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename Iterable,
+          typename LoopBody,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Hip>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Hip hip_res,
-            ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>const&,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam)
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+forall_impl(
+    resources::Hip hip_res,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
+    Iterable&& iter,
+    LoopBody&& loop_body,
+    ForallParam)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL =
+      ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter,
+                                    Concretizer, Async>;
+  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter,
+                                    LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator =
+      impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
+                                      Concretizer, UniqueMarker>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0) {
+  if (len > 0)
+  {
 
     auto func = reinterpret_cast<const void*>(
         &impl::forall_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType>);
@@ -560,14 +660,16 @@ forall_impl(resources::Hip hip_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::hip::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, hip_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body = RAJA::hip::make_launch_body(
+          func, dims.blocks, dims.threads, shmem, hip_res,
+          std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len};
-      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len};
+      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res,
+                        Async);
     }
 
     RAJA_FT_END;
@@ -577,37 +679,49 @@ forall_impl(resources::Hip hip_res,
 }
 
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename Iterable,
+          typename LoopBody,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Hip>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Hip hip_res,
-            ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam f_params)
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(
+    resources::Hip hip_res,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
+    Iterable&& iter,
+    LoopBody&& loop_body,
+    ForallParam f_params)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL =
+      ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter,
+                                    Concretizer, Async>;
+  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter,
+                                    LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator =
+      impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
+                                      Concretizer, UniqueMarker>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0) {
+  if (len > 0)
+  {
 
     auto func = reinterpret_cast<const void*>(
         &impl::forallp_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType,
@@ -627,9 +741,9 @@ forall_impl(resources::Hip hip_res,
     RAJA_FT_BEGIN;
 
     RAJA::hip::detail::hipInfo launch_info;
-    launch_info.gridDim = dims.blocks;
+    launch_info.gridDim  = dims.blocks;
     launch_info.blockDim = dims.threads;
-    launch_info.res = hip_res;
+    launch_info.res      = hip_res;
 
     {
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params, launch_info);
@@ -637,14 +751,17 @@ forall_impl(resources::Hip hip_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::hip::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, hip_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body = RAJA::hip::make_launch_body(
+          func, dims.blocks, dims.threads, shmem, hip_res,
+          std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params};
-      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len,
+                      (void*)&f_params};
+      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res,
+                        Async);
 
       RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params, launch_info);
     }
@@ -675,22 +792,28 @@ forall_impl(resources::Hip hip_res,
  ******************************************************************************
  */
 template <typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Hip>
-forall_impl(resources::Hip r,
-            ExecPolicy<seq_segit, ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>,
-            const TypedIndexSet<SegmentTypes...>& iset,
-            LoopBody&& loop_body)
+RAJA_INLINE resources::EventProxy<resources::Hip> forall_impl(
+    resources::Hip r,
+    ExecPolicy<
+        seq_segit,
+        ::RAJA::policy::hip::
+            hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
-  for (int isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(r,
-                     isi,
-                     detail::CallForall(),
-                     ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, true>(),
-                     loop_body);
+  for (int isi = 0; isi < num_seg; ++isi)
+  {
+    iset.segmentCall(
+        r, isi, detail::CallForall(),
+        ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter,
+                                      Concretizer, true>(),
+        loop_body);
   }  // iterate over segments of index set
 
   if (!Async) RAJA::hip::synchronize(r);
diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp
index c72a0b5c4f..67a7143ead 100644
--- a/include/RAJA/policy/hip/intrinsics.hpp
+++ b/include/RAJA/policy/hip/intrinsics.hpp
@@ -59,15 +59,9 @@ namespace impl
  */
 struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
 {
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
 /*!
@@ -96,18 +90,23 @@ struct AccessorDeviceScopeUseBlockFence
   static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int);
   static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long);
 
-  template < typename T >
+  template <typename T>
   static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
+                                                   max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
-    auto ptr = const_cast<integer_type*>(reinterpret_cast<const integer_type*>(in_ptr + idx));
-
-    for (size_t i = 0; i < u.array_size(); ++i) {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load)
-      u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    auto ptr = const_cast<integer_type*>(
+        reinterpret_cast<const integer_type*>(in_ptr + idx));
+
+    for (size_t i = 0; i < u.array_size(); ++i)
+    {
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load)
+      u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED,
+                                     __HIP_MEMORY_SCOPE_AGENT);
 #else
       u.array[i] = atomicAdd(&ptr[i], integer_type(0));
 #endif
@@ -116,19 +115,23 @@ struct AccessorDeviceScopeUseBlockFence
     return u.get_value();
   }
 
-  template < typename T >
+  template <typename T>
   static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
+                                                   max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
     u.set_value(val);
     auto ptr = reinterpret_cast<integer_type*>(in_ptr + idx);
 
-    for (size_t i = 0; i < u.array_size(); ++i) {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store)
-      __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    for (size_t i = 0; i < u.array_size(); ++i)
+    {
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store)
+      __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED,
+                         __HIP_MEMORY_SCOPE_AGENT);
 #else
       atomicExch(&ptr[i], u.array[i]);
 #endif
@@ -137,7 +140,8 @@ struct AccessorDeviceScopeUseBlockFence
 
   static RAJA_DEVICE RAJA_INLINE void fence_acquire()
   {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence)
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence)
     __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 #else
     __threadfence();
@@ -146,11 +150,13 @@ struct AccessorDeviceScopeUseBlockFence
 
   static RAJA_DEVICE RAJA_INLINE void fence_release()
   {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \
-                                        RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt)
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) &&                 \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt)
     __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
     // Wait until all vmem operations complete (s_waitcnt vmcnt(0))
-    __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8));
+    __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) |
+                               (/*lgkmcnt*/ 0xf << 8));
 #else
     __threadfence();
 #endif
@@ -175,10 +181,13 @@ constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
 template <typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
+                               max_shfl_int_type_size>
+      u;
   u.set_value(var);
 
-  for (size_t i = 0; i < u.array_size(); ++i) {
+  for (size_t i = 0; i < u.array_size(); ++i)
+  {
     u.array[i] = ::__shfl_xor(u.array[i], laneMask);
   }
   return u.get_value();
@@ -187,10 +196,13 @@ RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 template <typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
+                               max_shfl_int_type_size>
+      u;
   u.set_value(var);
 
-  for (size_t i = 0; i < u.array_size(); ++i) {
+  for (size_t i = 0; i < u.array_size(); ++i)
+  {
     u.array[i] = ::__shfl(u.array[i], srcLane);
   }
   return u.get_value();
@@ -233,23 +245,28 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
 
   T temp = val;
 
-  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0) {
+  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = shfl_xor_sync(temp, i);
-      Combiner{}(temp, rhs);
+      Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = shfl_sync(temp, srcLane);
+      T rhs       = shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        Combiner {}(temp, rhs);
       }
     }
   }
@@ -269,9 +286,10 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
 {
   T temp = val;
 
-  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+  {
     T rhs = shfl_xor_sync(temp, i);
-    Combiner{}(temp, rhs);
+    Combiner {}(temp, rhs);
   }
 
   return temp;
@@ -287,61 +305,77 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
 
-  int warpId = threadId % policy::hip::device_constants.WARP_SIZE;
+  int warpId  = threadId % policy::hip::device_constants.WARP_SIZE;
   int warpNum = threadId / policy::hip::device_constants.WARP_SIZE;
 
   T temp = val;
 
-  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0) {
+  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = shfl_xor_sync(temp, i);
-      Combiner{}(temp, rhs);
+      Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = shfl_sync(temp, srcLane);
+      T rhs       = shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        Combiner {}(temp, rhs);
       }
     }
   }
 
   // reduce per warp values
-  if (numThreads > policy::hip::device_constants.WARP_SIZE) {
+  if (numThreads > policy::hip::device_constants.WARP_SIZE)
+  {
 
-    static_assert(policy::hip::device_constants.MAX_WARPS <= policy::hip::device_constants.WARP_SIZE,
-        "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values");
+    static_assert(policy::hip::device_constants.MAX_WARPS <=
+                      policy::hip::device_constants.WARP_SIZE,
+                  "This algorithms assumes a warp of WARP_SIZE threads can "
+                  "reduce MAX_WARPS values");
 
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>)];
     RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>* sd =
-      reinterpret_cast<RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS> *>(tmpsd);
+        reinterpret_cast<RAJA::detail::SoAArray<
+            T, policy::hip::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
-    if (warpId == 0) {
+    if (warpId == 0)
+    {
       sd->set(warpNum, temp);
     }
 
     __syncthreads();
 
-    if (warpNum == 0) {
+    if (warpNum == 0)
+    {
 
       // read per warp values
-      if (warpId * policy::hip::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * policy::hip::device_constants.WARP_SIZE < numThreads)
+      {
         temp = sd->get(warpId);
-      } else {
+      }
+      else
+      {
         temp = identity;
       }
 
-      for (int i = 1; i < policy::hip::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < policy::hip::device_constants.MAX_WARPS; i *= 2)
+      {
         T rhs = shfl_xor_sync(temp, i);
-        Combiner{}(temp, rhs);
+        Combiner {}(temp, rhs);
       }
     }
 
diff --git a/include/RAJA/policy/hip/kernel/Conditional.hpp b/include/RAJA/policy/hip/kernel/Conditional.hpp
index 3204845544..a882b547d7 100644
--- a/include/RAJA/policy/hip/kernel/Conditional.hpp
+++ b/include/RAJA/policy/hip/kernel/Conditional.hpp
@@ -41,19 +41,18 @@ template <typename Data,
           typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<Data,
-                             statement::If<Conditional, EnclosedStmts...>,
-                             Types> {
+                            statement::If<Conditional, EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
-    if (Conditional::eval(data)) {
+    if (Conditional::eval(data))
+    {
 
       // execute enclosed statements
       enclosed_stmts_t::exec(data, thread_active);
@@ -61,10 +60,7 @@ struct HipStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp
index 39e7104c16..6e90852841 100644
--- a/include/RAJA/policy/hip/kernel/For.hpp
+++ b/include/RAJA/policy/hip/kernel/For.hpp
@@ -45,9 +45,12 @@ template <typename Data,
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                  sync,
+                                                  IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -60,13 +63,13 @@ struct HipStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+      RAJA::policy::hip::
+          hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -79,14 +82,13 @@ struct HipStatementExecutor<
     enclosed_stmts_t::exec(data, thread_active && have_work);
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
     HipDims my_dims(0), my_min_dims(0);
     DimensionCalculator::set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -108,9 +110,13 @@ template <typename Data,
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::sync,
+                       IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -122,21 +128,24 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      RAJA::internal::KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -151,14 +160,13 @@ struct HipStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -180,9 +188,13 @@ template <typename Data,
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::none,
+                       IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -194,21 +206,24 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      RAJA::internal::KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through one at a time
     // threads will have different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the index to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -218,14 +233,13 @@ struct HipStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -245,14 +259,19 @@ struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
     Types>
-: HipStatementExecutor<Data, statement::For<ArgumentId,
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
-};
+    : HipStatementExecutor<
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 
 /*
@@ -263,33 +282,32 @@ struct HipStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_warp_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -299,13 +317,11 @@ struct HipStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -320,7 +336,7 @@ struct HipStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -332,41 +348,41 @@ struct HipStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_warp_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -382,9 +398,7 @@ struct HipStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -399,7 +413,7 @@ struct HipStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -411,30 +425,28 @@ struct HipStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_thread_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -444,13 +456,11 @@ struct HipStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -466,7 +476,7 @@ struct HipStatementExecutor<
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
@@ -478,39 +488,38 @@ struct HipStatementExecutor<
 template <typename Data,
           camp::idx_t ArgumentId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_thread_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -526,9 +535,7 @@ struct HipStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -544,7 +551,7 @@ struct HipStatementExecutor<
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp
index ba6642f248..823f6b1293 100644
--- a/include/RAJA/policy/hip/kernel/ForICount.hpp
+++ b/include/RAJA/policy/hip/kernel/ForICount.hpp
@@ -46,33 +46,40 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::
+                  hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
       statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                     RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                    sync,
+                                                    IndexMapper>,
                      EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -102,38 +109,52 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -165,38 +186,52 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the index to the argument and param
       data.template assign_offset<ArgumentId>(i);
@@ -225,14 +260,19 @@ struct HipStatementExecutor<
     Data,
     statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
     Types>
-: HipStatementExecutor<Data, statement::ForICount<ArgumentId,
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
-};
+    : HipStatementExecutor<
+          Data,
+          statement::ForICount<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 
 /*
@@ -244,40 +284,47 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_warp_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_warp_masked_direct<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -288,9 +335,8 @@ struct HipStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
 
@@ -303,48 +349,56 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_warp_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_warp_masked_loop<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -359,7 +413,6 @@ struct HipStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 
@@ -372,37 +425,43 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_thread_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_thread_masked_direct<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -413,9 +472,8 @@ struct HipStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
 
@@ -428,45 +486,52 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_thread_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_thread_masked_loop<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -481,7 +546,6 @@ struct HipStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 }  // namespace internal
diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp
index 1ed7740008..8c5cb83d39 100644
--- a/include/RAJA/policy/hip/kernel/HipKernel.hpp
+++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp
@@ -51,7 +51,8 @@ namespace RAJA
  *
  */
 template <bool async0, int num_blocks, int num_threads>
-struct hip_explicit_launch {};
+struct hip_explicit_launch
+{};
 
 /*!
  * HIP kernel launch policy where the user specifies the number of physical
@@ -87,8 +88,10 @@ namespace statement
  */
 template <typename LaunchConfig, typename... EnclosedStmts>
 struct HipKernelExt
-    : public internal::Statement<::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>, EnclosedStmts...> {
-};
+    : public internal::Statement<
+          ::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>,
+          EnclosedStmts...>
+{};
 
 
 /*!
@@ -99,7 +102,8 @@ struct HipKernelExt
  */
 template <int num_blocks, int num_threads, typename... EnclosedStmts>
 using HipKernelExp =
-    HipKernelExt<hip_explicit_launch<false, num_blocks, num_threads>, EnclosedStmts...>;
+    HipKernelExt<hip_explicit_launch<false, num_blocks, num_threads>,
+                 EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with the flexibility
@@ -109,7 +113,8 @@ using HipKernelExp =
  */
 template <int num_blocks, int num_threads, typename... EnclosedStmts>
 using HipKernelExpAsync =
-    HipKernelExt<hip_explicit_launch<true, num_blocks, num_threads>, EnclosedStmts...>;
+    HipKernelExt<hip_explicit_launch<true, num_blocks, num_threads>,
+                 EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel using the
@@ -135,9 +140,9 @@ using HipKernelOccAsync =
  * The kernel launch is synchronous.
  */
 template <int num_threads, typename... EnclosedStmts>
-using HipKernelFixed =
-    HipKernelExt<hip_explicit_launch<false, operators::limits<int>::max(), num_threads>,
-                  EnclosedStmts...>;
+using HipKernelFixed = HipKernelExt<
+    hip_explicit_launch<false, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with a fixed
@@ -145,8 +150,9 @@ using HipKernelFixed =
  * The kernel launch is asynchronous.
  */
 template <int num_threads, typename... EnclosedStmts>
-using HipKernelFixedAsync =
-    HipKernelExt<hip_explicit_launch<true, operators::limits<int>::max(), num_threads>, EnclosedStmts...>;
+using HipKernelFixedAsync = HipKernelExt<
+    hip_explicit_launch<true, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with 1024 threads
@@ -175,7 +181,7 @@ template <typename Data, typename Exec>
 __global__ void HipKernelLauncher(Data data)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   Exec::exec(private_data, true);
@@ -194,7 +200,7 @@ __launch_bounds__(BlockSize, 1) __global__
     void HipKernelLauncherFixed(Data data)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   // execute the the object
@@ -210,10 +216,11 @@ __launch_bounds__(BlockSize, 1) __global__
  * The default case handles BlockSize != 0 and gets the fixed max block size
  * version of the kernel.
  */
-template<int BlockSize, typename Data, typename executor_t>
+template <int BlockSize, typename Data, typename executor_t>
 struct HipKernelLauncherGetter
 {
-  using type = camp::decay<decltype(&internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>)>;
+  using type = camp::decay<
+      decltype(&internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>)>;
   static constexpr type get() noexcept
   {
     return &internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>;
@@ -224,10 +231,11 @@ struct HipKernelLauncherGetter
  * Helper class specialization for BlockSize == 0 and gets the unfixed max
  * block size version of the kernel.
  */
-template<typename Data, typename executor_t>
+template <typename Data, typename executor_t>
 struct HipKernelLauncherGetter<0, Data, executor_t>
 {
-  using type = camp::decay<decltype(&internal::HipKernelLauncher<Data, executor_t>)>;
+  using type =
+      camp::decay<decltype(&internal::HipKernelLauncher<Data, executor_t>)>;
   static constexpr type get() noexcept
   {
     return &internal::HipKernelLauncher<Data, executor_t>;
@@ -235,12 +243,14 @@ struct HipKernelLauncherGetter<0, Data, executor_t>
 };
 
 
-
 /*!
  * Helper class that handles HIP kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template <typename LaunchPolicy,
+          typename StmtList,
+          typename Data,
+          typename Types>
 struct HipLaunchHelper;
 
 
@@ -249,16 +259,28 @@ struct HipLaunchHelper;
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the HIP occupancy calculator.
  */
-template<bool async0, int num_blocks, int num_threads, typename StmtList, typename Data, typename Types>
-struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,StmtList,Data,Types>
+template <bool async0,
+          int num_blocks,
+          int num_threads,
+          typename StmtList,
+          typename Data,
+          typename Types>
+struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,
+                       StmtList,
+                       Data,
+                       Types>
 {
   using Self = HipLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::hip_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::hip_statement_list_executor_t<StmtList, Data, Types>;
 
-  using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>;
+  using kernelGetter_t =
+      HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
+                              Data,
+                              executor_t>;
 
   inline static const void* get_func()
   {
@@ -266,13 +288,16 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
   }
 
   inline static void recommended_blocks_threads(size_t shmem_size,
-      int &recommended_blocks, int &recommended_threads)
+                                                int& recommended_blocks,
+                                                int& recommended_threads)
   {
     auto func = Self::get_func();
 
-    if (num_blocks <= 0) {
+    if (num_blocks <= 0)
+    {
 
-      if (num_threads <= 0) {
+      if (num_threads <= 0)
+      {
 
         //
         // determine blocks at runtime
@@ -280,10 +305,11 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         //
         auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads<Self>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_device;
+        recommended_blocks  = data.func_max_blocks_per_device;
         recommended_threads = data.func_max_threads_per_block;
-
-      } else {
+      }
+      else
+      {
 
         //
         // determine blocks at runtime
@@ -293,69 +319,73 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
 
         auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
+        recommended_blocks =
+            data.func_max_blocks_per_sm * data.device_sm_per_device;
       }
+    }
+    else
+    {
 
-    } else {
-
-      if (num_threads <= 0) {
+      if (num_threads <= 0)
+      {
 
         //
         // determine threads at runtime, unsure what use 1024
         // this value may be invalid for kernels with high register pressure
         //
         recommended_threads = 1024;
-
-      } else {
+      }
+      else
+      {
 
         //
         // threads determined at compile-time
         //
         recommended_threads = num_threads;
-
       }
 
       //
       // blocks determined at compile-time
       //
       recommended_blocks = num_blocks;
-
     }
   }
 
-  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size), int &max_threads)
+  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
+                                 int& max_threads)
   {
-    if (num_threads <= 0) {
+    if (num_threads <= 0)
+    {
 
       //
       // determine threads at runtime, unsure what use 1024
       // this value may be invalid for kernels with high register pressure
       //
       max_threads = 1024;
-
-    } else {
+    }
+    else
+    {
 
       //
       // threads determined at compile-time
       //
       max_threads = num_threads;
-
     }
   }
 
-  inline static void max_blocks(size_t shmem_size,
-      int &max_blocks, int actual_threads)
+  inline static void
+  max_blocks(size_t shmem_size, int& max_blocks, int actual_threads)
   {
     auto func = Self::get_func();
 
-    if (num_blocks <= 0) {
+    if (num_blocks <= 0)
+    {
 
       //
       // determine blocks at runtime
       //
-      if (num_threads <= 0 ||
-          num_threads != actual_threads) {
+      if (num_threads <= 0 || num_threads != actual_threads)
+      {
 
         //
         // determine blocks when actual_threads != num_threads
@@ -363,8 +393,9 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self>(
             func, shmem_size, actual_threads);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
-      } else {
+      }
+      else
+      {
 
         //
         // determine blocks when actual_threads == num_threads
@@ -372,16 +403,15 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
       }
-
-    } else {
+    }
+    else
+    {
 
       //
       // blocks determined at compile-time
       //
       max_blocks = num_blocks;
-
     }
   }
 };
@@ -395,8 +425,10 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
  * The algorithm is greedy (and probably could be improved), and favors
  * maximizing the number of threads (or blocks) in x, y, then z.
  */
-inline
-hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum = hip_dim_t()){
+inline hip_dim_t fitHipDims(hip_dim_member_t limit,
+                            hip_dim_t result,
+                            hip_dim_t minimum = hip_dim_t())
+{
 
 
   // clamp things to at least 1
@@ -409,12 +441,13 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
   minimum.z = minimum.z ? minimum.z : 1;
 
   // if we are under the limit, we're done
-  if(result.x * result.y * result.z <= limit) return result;
+  if (result.x * result.y * result.z <= limit) return result;
 
   // Can we reduce z to fit?
-  if(result.x * result.y * minimum.z < limit){
+  if (result.x * result.y * minimum.z < limit)
+  {
     // compute a new z
-    result.z = limit / (result.x*result.y);
+    result.z = limit / (result.x * result.y);
     return result;
   }
   // we don't fit, so reduce z to it's minimum and continue on to y
@@ -422,9 +455,10 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
 
 
   // Can we reduce y to fit?
-  if(result.x * minimum.y * result.z < limit){
+  if (result.x * minimum.y * result.z < limit)
+  {
     // compute a new y
-    result.y = limit / (result.x*result.z);
+    result.y = limit / (result.x * result.z);
     return result;
   }
   // we don't fit, so reduce y to it's minimum and continue on to x
@@ -432,9 +466,10 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
 
 
   // Can we reduce y to fit?
-  if(minimum.x * result.y * result.z < limit){
+  if (minimum.x * result.y * result.z < limit)
+  {
     // compute a new x
-    result.x = limit / (result.y*result.z);
+    result.x = limit / (result.y * result.z);
     return result;
   }
   // we don't fit, so we'll return the smallest possible thing
@@ -449,18 +484,20 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
  */
 template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::HipKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::HipKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
-  using StatementType =
-      statement::HipKernelExt<LaunchConfig, EnclosedStmts...>;
+  using stmt_list_t   = StatementList<EnclosedStmts...>;
+  using StatementType = statement::HipKernelExt<LaunchConfig, EnclosedStmts...>;
 
   template <typename Data>
-  static inline void exec(Data &&data)
+  static inline void exec(Data&& data)
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = hip_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        hip_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = HipLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
 
 
@@ -474,9 +511,10 @@ struct StatementExecutor<
 
 
     // Only launch kernel if we have something to iterate over
-    int num_blocks = launch_dims.num_blocks();
+    int num_blocks  = launch_dims.num_blocks();
     int num_threads = launch_dims.num_threads();
-    if (num_blocks > 0 || num_threads > 0) {
+    if (num_blocks > 0 || num_threads > 0)
+    {
 
       //
       // Setup shared memory buffers
@@ -489,8 +527,8 @@ struct StatementExecutor<
       //
       int recommended_blocks;
       int recommended_threads;
-      launch_t::recommended_blocks_threads(
-          shmem, recommended_blocks, recommended_threads);
+      launch_t::recommended_blocks_threads(shmem, recommended_blocks,
+                                           recommended_threads);
 
 
       //
@@ -503,24 +541,24 @@ struct StatementExecutor<
       //
       // Fit the requested threads
       //
-      hip_dim_t fit_threads{0,0,0};
+      hip_dim_t fit_threads {0, 0, 0};
 
-      if ( recommended_threads >= get_size(launch_dims.min_dims.threads) ) {
-
-        fit_threads = fitHipDims(
-            recommended_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads >= get_size(launch_dims.min_dims.threads))
+      {
 
+        fit_threads = fitHipDims(recommended_threads, launch_dims.dims.threads,
+                                 launch_dims.min_dims.threads);
       }
 
       //
       // Redo fit with max threads
       //
-      if ( recommended_threads < max_threads &&
-           get_size(fit_threads) != recommended_threads ) {
-
-        fit_threads = fitHipDims(
-            max_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads < max_threads &&
+          get_size(fit_threads) != recommended_threads)
+      {
 
+        fit_threads = fitHipDims(max_threads, launch_dims.dims.threads,
+                                 launch_dims.min_dims.threads);
       }
 
       launch_dims.dims.threads = fit_threads;
@@ -534,24 +572,25 @@ struct StatementExecutor<
 
       int use_blocks;
 
-      if ( launch_dims.num_threads() == recommended_threads ) {
+      if (launch_dims.num_threads() == recommended_threads)
+      {
 
         //
         // Fit the requested blocks
         //
         use_blocks = recommended_blocks;
-
-      } else {
+      }
+      else
+      {
 
         //
         // Fit the max blocks
         //
         use_blocks = max_blocks;
-
       }
 
-      launch_dims.dims.blocks = fitHipDims(
-          use_blocks, launch_dims.dims.blocks, launch_dims.min_dims.blocks);
+      launch_dims.dims.blocks = fitHipDims(use_blocks, launch_dims.dims.blocks,
+                                           launch_dims.min_dims.blocks);
 
       //
       // make sure that we fit
@@ -560,7 +599,8 @@ struct StatementExecutor<
       if(launch_dims.num_blocks() > max_blocks){
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");
       }*/
-      if(launch_dims.num_threads() > max_threads){
+      if (launch_dims.num_threads() > max_threads)
+      {
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");
       }
 
@@ -574,14 +614,17 @@ struct StatementExecutor<
         // of the launch_dims and potential changes to shmem here that is
         // currently an unresolved issue.
         //
-        auto hip_data = RAJA::hip::make_launch_body(func,
-            launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res, data);
+        auto hip_data = RAJA::hip::make_launch_body(
+            func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
+            data);
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&hip_data};
-        RAJA::hip::launch(func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, launch_t::async);
+        void* args[] = {(void*)&hip_data};
+        RAJA::hip::launch(func, launch_dims.dims.blocks,
+                          launch_dims.dims.threads, args, shmem, res,
+                          launch_t::async);
       }
     }
   }
diff --git a/include/RAJA/policy/hip/kernel/Hyperplane.hpp b/include/RAJA/policy/hip/kernel/Hyperplane.hpp
index 5c428f03ab..a9888d17a7 100644
--- a/include/RAJA/policy/hip/kernel/Hyperplane.hpp
+++ b/include/RAJA/policy/hip/kernel/Hyperplane.hpp
@@ -41,33 +41,31 @@ template <typename Data,
           camp::idx_t... Args,
           typename... EnclosedStmts,
           typename Types>
-struct HipStatementExecutor<Data,
-                             statement::Hyperplane<HpArgumentId,
-                                                   seq_exec,
-                                                   ArgList<Args...>,
-                                                   EnclosedStmts...>,
-                            Types> {
+struct HipStatementExecutor<
+    Data,
+    statement::
+        Hyperplane<HpArgumentId, seq_exec, ArgList<Args...>, EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, HpArgumentId, Data>;
 
-  using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+  using enclosed_stmts_t =
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // compute Manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    int hp_len = segment_length<HpArgumentId>(data) +
-                 foldl(RAJA::operators::plus<int>(),
-                               segment_length<Args>(data)...);
+    int hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<int>(), segment_length<Args>(data)...);
 
     int h_args = foldl(RAJA::operators::plus<idx_t>(),
-        camp::get<Args>(data.offset_tuple)...);
+                       camp::get<Args>(data.offset_tuple)...);
 
     // get length of i dimension
     auto i_len = segment_length<HpArgumentId>(data);
@@ -79,7 +77,8 @@ struct HipStatementExecutor<Data,
      * We reject the iterations that lie outside of the specified rectangular
      * region we are sweeping.
      */
-    for (int h = 0; h < hp_len; ++h) {
+    for (int h = 0; h < hp_len; ++h)
+    {
 
       // compute actual iterate for HpArgumentId
       // as:  i0 = h - (i1 + i2 + i3 + ...)
@@ -93,18 +92,13 @@ struct HipStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
 };
 
 
-
-
 }  // end namespace internal
 
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/hip/kernel/InitLocalMem.hpp b/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
index bbb8d6081b..b59ec5c88a 100644
--- a/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
+++ b/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
@@ -39,27 +39,30 @@ struct hip_shared_mem;
 namespace internal
 {
 
-//Intialize thread shared array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
+// Initialize thread shared array
+template <typename Data,
+          camp::idx_t... Indices,
+          typename... EnclosedStmts,
+          typename Types>
 struct HipStatementExecutor<Data,
-                            statement::InitLocalMem<RAJA::hip_shared_mem, camp::idx_seq<Indices...>,
-                            EnclosedStmts...>,
+                            statement::InitLocalMem<RAJA::hip_shared_mem,
+                                                    camp::idx_seq<Indices...>,
+                                                    EnclosedStmts...>,
                             Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  //Launch loops
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Launch loops
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -67,40 +70,33 @@ struct HipStatementExecutor<Data,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null base case
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
 
-  //Set pointer to null recursive case
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null recursive case
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
@@ -108,47 +104,47 @@ struct HipStatementExecutor<Data,
   }
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
 
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
-//Intialize thread private array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct HipStatementExecutor<Data, statement::InitLocalMem<RAJA::hip_thread_mem, camp::idx_seq<Indices...>, EnclosedStmts...>, Types>
+// Initialize thread private array
+template <typename Data,
+          camp::idx_t... Indices,
+          typename... EnclosedStmts,
+          typename Types>
+struct HipStatementExecutor<Data,
+                            statement::InitLocalMem<RAJA::hip_thread_mem,
+                                                    camp::idx_seq<Indices...>,
+                                                    EnclosedStmts...>,
+                            Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  //Launch loops
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Launch loops
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -156,40 +152,33 @@ struct HipStatementExecutor<Data, statement::InitLocalMem<RAJA::hip_thread_mem,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  // Initialize local array
+  // Identifies type + number of elements needed
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
-  template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null base case
+  template <camp::idx_t Pos>
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
 
-  //Set pointer to null recursive case
-  template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  // Set pointer to null recursive case
+  template <camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
@@ -197,31 +186,24 @@ struct HipStatementExecutor<Data, statement::InitLocalMem<RAJA::hip_thread_mem,
   }
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
 
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Lambda.hpp b/include/RAJA/policy/hip/kernel/Lambda.hpp
index d04fb11bf6..7835ddb7eb 100644
--- a/include/RAJA/policy/hip/kernel/Lambda.hpp
+++ b/include/RAJA/policy/hip/kernel/Lambda.hpp
@@ -40,30 +40,34 @@ namespace RAJA
 namespace internal
 {
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct HipStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
+template <typename Data,
+          camp::idx_t LambdaIndex,
+          typename... Args,
+          typename Types>
+struct HipStatementExecutor<Data,
+                            statement::Lambda<LambdaIndex, Args...>,
+                            Types>
+{
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active)
+    {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims
+  calculateDimensions(Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
 
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Reduce.hpp b/include/RAJA/policy/hip/kernel/Reduce.hpp
index a518073e7c..2799207979 100644
--- a/include/RAJA/policy/hip/kernel/Reduce.hpp
+++ b/include/RAJA/policy/hip/kernel/Reduce.hpp
@@ -35,31 +35,34 @@ namespace internal
 // Executor that handles reductions across a single HIP thread block
 //
 template <typename Data,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<Data,
-                             statement::Reduce<RAJA::hip_block_reduce,
-                                               ReduceOperator,
-                                               ParamId,
-                                               EnclosedStmts...>,
-                           Types> {
+                            statement::Reduce<RAJA::hip_block_reduce,
+                                              ReduceOperator,
+                                              ParamId,
+                                              EnclosedStmts...>,
+                            Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // block reduce on the specified parameter
-    auto value = data.template get_param<ParamId>();
+    auto value    = data.template get_param<ParamId>();
     using value_t = decltype(value);
     value_t ident = value_t();
 
     // if this thread isn't active, just set it to the identity
-    if (!thread_active) {
+    if (!thread_active)
+    {
       value = ident;
     }
 
@@ -67,13 +70,13 @@ struct HipStatementExecutor<Data,
     // reduction objects
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::hip::impl::block_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::hip::impl::block_reduce<combiner_t>(value, ident);
 
 
     // execute enclosed statements, and mask off everyone but thread 0
     thread_active = threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
-    if(thread_active){
+    if (thread_active)
+    {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
@@ -81,7 +84,7 @@ struct HipStatementExecutor<Data,
   }
 
 
-  static inline LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -94,44 +97,47 @@ struct HipStatementExecutor<Data,
 // Executor that handles reductions across a single HIP thread warp
 //
 template <typename Data,
-          template <typename...> class ReduceOperator,
+          template <typename...>
+          class ReduceOperator,
           typename ParamId,
           typename... EnclosedStmts,
           typename Types>
 struct HipStatementExecutor<Data,
-                             statement::Reduce<RAJA::hip_warp_reduce,
-                                               ReduceOperator,
-                                               ParamId,
-                                               EnclosedStmts...>,
-                            Types> {
+                            statement::Reduce<RAJA::hip_warp_reduce,
+                                              ReduceOperator,
+                                              ParamId,
+                                              EnclosedStmts...>,
+                            Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // block reduce on the specified parameter
-    auto value = data.template get_param<ParamId>();
+    auto value    = data.template get_param<ParamId>();
     using value_t = decltype(value);
     value_t ident = value_t();
 
     // if this thread isn't active, just set it to the identity
-    if (!thread_active) {
+    if (!thread_active)
+    {
       value = ident;
     }
 
     // Call warp reduction routine
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::hip::impl::warp_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::hip::impl::warp_reduce<combiner_t>(value, ident);
     data.template assign_param<ParamId>(new_value);
 
     // execute enclosed statements, and mask off everyone but lane 0
     thread_active = threadIdx.x == 0;
-    if(thread_active){
+    if (thread_active)
+    {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
@@ -139,7 +145,7 @@ struct HipStatementExecutor<Data,
   }
 
 
-  static inline LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -148,7 +154,6 @@ struct HipStatementExecutor<Data,
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Sync.hpp b/include/RAJA/policy/hip/kernel/Sync.hpp
index d54a5ccf83..b5d590cb3a 100644
--- a/include/RAJA/policy/hip/kernel/Sync.hpp
+++ b/include/RAJA/policy/hip/kernel/Sync.hpp
@@ -43,14 +43,14 @@ namespace statement
 /*!
  * A RAJA::kernel statement that performs a HIP __syncthreads().
  */
-struct HipSyncThreads : public internal::Statement<camp::nil> {
-};
+struct HipSyncThreads : public internal::Statement<camp::nil>
+{};
 
 /*!
  * A RAJA::kernel statement that performs a HIP __syncwarp().
  */
-struct HipSyncWarp : public internal::Statement<camp::nil> {
-};
+struct HipSyncWarp : public internal::Statement<camp::nil>
+{};
 
 }  // namespace statement
 
@@ -58,34 +58,31 @@ namespace internal
 {
 
 template <typename Data, typename Types>
-struct HipStatementExecutor<Data, statement::HipSyncThreads, Types> {
+struct HipStatementExecutor<Data, statement::HipSyncThreads, Types>
+{
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &, bool) { __syncthreads(); }
+  static inline RAJA_DEVICE void exec(Data&, bool) { __syncthreads(); }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims
+  calculateDimensions(Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
 template <typename Data, typename Types>
-struct HipStatementExecutor<Data, statement::HipSyncWarp, Types> {
+struct HipStatementExecutor<Data, statement::HipSyncWarp, Types>
+{
 
-  static
-  inline
-  RAJA_DEVICE
-  //not currently supported
-  void exec(Data &, bool) {  }
+  static inline RAJA_DEVICE
+      // no-op: warp-level sync is not currently supported
+      void
+      exec(Data&, bool)
+  {}
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims
+  calculateDimensions(Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp
index 62dda7f20d..4490bddf42 100644
--- a/include/RAJA/policy/hip/kernel/Tile.hpp
+++ b/include/RAJA/policy/hip/kernel/Tile.hpp
@@ -58,10 +58,12 @@ struct HipStatementExecutor<
     Data,
     statement::Tile<ArgumentId,
                     RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                   sync,
+                                                   IndexMapper>,
                     EnclosedStmts...>,
-                    Types>
-  {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -69,19 +71,21 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+  using DimensionCalculator = KernelDimensionCalculator<
+      RAJA::policy::hip::
+          hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -100,23 +104,23 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, static_cast<diff_t>(chunk_size));
@@ -141,11 +145,16 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -153,26 +162,32 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -190,23 +205,23 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -231,11 +246,16 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -243,26 +263,32 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -275,23 +301,23 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -316,15 +342,22 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
-: HipStatementExecutor<Data, statement::Tile<ArgumentId, TPol,
-    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
-};
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : HipStatementExecutor<
+          Data,
+          statement::Tile<
+              ArgumentId,
+              TPol,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 }  // end namespace internal
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp
index 07637fbd8f..fc4a5c5222 100644
--- a/include/RAJA/policy/hip/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp
@@ -58,42 +58,49 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::
+                  hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
       statement::Tile<ArgumentId,
                       RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                     sync,
+                                                     IndexMapper>,
                       EnclosedStmts...>,
-                      Types>;
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t t = IndexMapper::template index<diff_t>();
-    const diff_t i = t * static_cast<diff_t>(chunk_size);
+    const diff_t t   = IndexMapper::template index<diff_t>();
+    const diff_t i   = t * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -129,50 +136,64 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(chunk_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(chunk_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -207,50 +228,64 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(chunk_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(chunk_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for(diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -279,15 +314,24 @@ template <typename Data,
           typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
-: HipStatementExecutor<Data, statement::TileTCount<ArgumentId, ParamId, TPol,
-    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
-};
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : HipStatementExecutor<
+          Data,
+          statement::TileTCount<
+              ArgumentId,
+              ParamId,
+              TPol,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 }  // end namespace internal
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp
index aa0610d736..8b3793ae70 100644
--- a/include/RAJA/policy/hip/kernel/internal.hpp
+++ b/include/RAJA/policy/hip/kernel/internal.hpp
@@ -44,29 +44,26 @@ namespace RAJA
 namespace internal
 {
 
-struct LaunchDims {
+struct LaunchDims
+{
 
   HipDims dims;
   HipDims min_dims;
 
-  LaunchDims() = default;
-  LaunchDims(LaunchDims const&) = default;
+  LaunchDims()                             = default;
+  LaunchDims(LaunchDims const&)            = default;
   LaunchDims& operator=(LaunchDims const&) = default;
 
   RAJA_INLINE
-  LaunchDims(HipDims _dims)
-    : dims{_dims}
-    , min_dims{}
-  { }
+  LaunchDims(HipDims _dims) : dims {_dims}, min_dims {} {}
 
   RAJA_INLINE
   LaunchDims(HipDims _dims, HipDims _min_dims)
-    : dims{_dims}
-    , min_dims{_min_dims}
-  { }
+      : dims {_dims}, min_dims {_min_dims}
+  {}
 
   RAJA_INLINE
-  LaunchDims max(LaunchDims const &c) const
+  LaunchDims max(LaunchDims const& c) const
   {
     LaunchDims result;
 
@@ -82,43 +79,44 @@ struct LaunchDims {
     result.dims.threads.y = std::max(c.dims.threads.y, dims.threads.y);
     result.dims.threads.z = std::max(c.dims.threads.z, dims.threads.z);
 
-    result.min_dims.threads.x = std::max(c.min_dims.threads.x, min_dims.threads.x);
-    result.min_dims.threads.y = std::max(c.min_dims.threads.y, min_dims.threads.y);
-    result.min_dims.threads.z = std::max(c.min_dims.threads.z, min_dims.threads.z);
+    result.min_dims.threads.x =
+        std::max(c.min_dims.threads.x, min_dims.threads.x);
+    result.min_dims.threads.y =
+        std::max(c.min_dims.threads.y, min_dims.threads.y);
+    result.min_dims.threads.z =
+        std::max(c.min_dims.threads.z, min_dims.threads.z);
 
     return result;
   }
 
   RAJA_INLINE
-  int num_blocks() const {
-    return dims.num_blocks();
-  }
+  int num_blocks() const { return dims.num_blocks(); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return dims.num_threads();
-  }
+  int num_threads() const { return dims.num_threads(); }
 
 
   RAJA_INLINE
-  void clamp_to_min_blocks() {
+  void clamp_to_min_blocks()
+  {
     dims.blocks.x = std::max(min_dims.blocks.x, dims.blocks.x);
     dims.blocks.y = std::max(min_dims.blocks.y, dims.blocks.y);
     dims.blocks.z = std::max(min_dims.blocks.z, dims.blocks.z);
   };
 
   RAJA_INLINE
-  void clamp_to_min_threads() {
+  void clamp_to_min_threads()
+  {
     dims.threads.x = std::max(min_dims.threads.x, dims.threads.x);
     dims.threads.y = std::max(min_dims.threads.y, dims.threads.y);
     dims.threads.z = std::max(min_dims.threads.z, dims.threads.z);
   };
-
 };
 
 
 template <camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
-struct HipStatementListExecutorHelper {
+struct HipStatementListExecutorHelper
+{
 
   using next_helper_t =
       HipStatementListExecutorHelper<cur_stmt + 1, num_stmts, StmtList>;
@@ -126,7 +124,7 @@ struct HipStatementListExecutorHelper {
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, bool thread_active)
+  inline static RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, thread_active);
@@ -137,7 +135,7 @@ struct HipStatementListExecutorHelper {
 
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &data)
+  inline static LaunchDims calculateDimensions(Data& data)
   {
     // Compute this statements launch dimensions
     LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
@@ -151,16 +149,17 @@ struct HipStatementListExecutorHelper {
 };
 
 template <camp::idx_t num_stmts, typename StmtList>
-struct HipStatementListExecutorHelper<num_stmts, num_stmts, StmtList> {
+struct HipStatementListExecutorHelper<num_stmts, num_stmts, StmtList>
+{
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &, bool)
+  inline static RAJA_DEVICE void exec(Data&, bool)
   {
     // nop terminator
   }
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &)
+  inline static LaunchDims calculateDimensions(Data&)
   {
     return LaunchDims();
   }
@@ -175,109 +174,120 @@ struct HipStatementListExecutor;
 
 
 template <typename Data, typename... Stmts, typename Types>
-struct HipStatementListExecutor<Data, StatementList<Stmts...>, Types> {
+struct HipStatementListExecutor<Data, StatementList<Stmts...>, Types>
+{
 
   using enclosed_stmts_t =
       camp::list<HipStatementExecutor<Data, Stmts, Types>...>;
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute statements in order with helper class
-    HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, thread_active);
+    HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, thread_active);
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute this statements launch dimensions
-    return HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
-        calculateDimensions(data);
+    return HipStatementListExecutorHelper<
+        0, num_stmts, enclosed_stmts_t>::calculateDimensions(data);
   }
 };
 
 
 template <typename StmtList, typename Data, typename Types>
-using hip_statement_list_executor_t = HipStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+using hip_statement_list_executor_t =
+    HipStatementListExecutor<Data, StmtList, Types>;
 
 
 // specialization for direct sequential policies
-template<typename kernel_indexer>
+template <typename kernel_indexer>
 struct KernelDimensionCalculator;
 
 // specialization for direct sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+  template <typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
-    if ( len > static_cast<IdxT>(1) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(1))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
   }
 };
 
 // specialization for direct thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(len));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::block_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::block_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for direct block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -285,164 +295,219 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::grid_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for direct global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+  template <typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
-    if (len > static_cast<IdxT>(0)) {
+    if (len > static_cast<IdxT>(0))
+    {
       RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
     }
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_hip_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_hip_dim<dim>(dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > (static_cast<IdxT>(IndexMapper::block_size) *
-                static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (static_cast<IdxT>(IndexMapper::block_size) *
+               static_cast<IdxT>(IndexMapper::grid_size)))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 
 // specialization for strided loop sequential policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
-
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT RAJA_UNUSED_ARG(len))
-  {
-  }
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+
+  template <typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT RAJA_UNUSED_ARG(len))
+  {}
 };
 
 // specialization for strided loop thread policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void
+  set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for strided loop block policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(len));
@@ -450,35 +515,43 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void
+  set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for strided loop global policies
-template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
+template <named_dim dim, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if (len > static_cast<IdxT>(0)) {
+    if (len > static_cast<IdxT>(0))
+    {
       set_hip_dim<dim>(dims.threads, static_cast<IdxT>(1));
       set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(1));
       set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
@@ -487,62 +560,88 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
   }
 };
 ///
-template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
+template <named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_hip_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_hip_dim<dim>(dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
+template <named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template <typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(1));
   }
 };
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
+template <named_dim dim,
+          int BLOCK_SIZE,
+          int GRID_SIZE,
+          kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template <typename IdxT>
+  static void
+  set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp
index 6823647b48..4940c6d365 100644
--- a/include/RAJA/policy/hip/launch.hpp
+++ b/include/RAJA/policy/hip/launch.hpp
@@ -35,9 +35,9 @@ __global__ void launch_global_fcn(BODY body_in)
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
@@ -45,38 +45,45 @@ __global__ void launch_global_fcn(BODY body_in)
 }
 
 template <typename BODY, typename ReduceParams>
-__global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params)
+__global__ void launch_new_reduce_global_fcn(BODY body_in,
+                                             ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
 template <bool async>
-struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>> {
+struct LaunchExecute<
+    RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>>
+{
 
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_global_fcn<BODY>);
+    auto func = reinterpret_cast<const void*>(&launch_global_fcn<BODY>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
@@ -84,18 +91,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(params.teams.value[0]),
+    hip_dim_t gridSize {static_cast<hip_dim_member_t>(params.teams.value[0]),
                         static_cast<hip_dim_member_t>(params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(params.teams.value[2]) };
+                        static_cast<hip_dim_member_t>(params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(params.threads.value[2]) };
+    hip_dim_t blockSize {
+        static_cast<hip_dim_member_t>(params.threads.value[0]),
+        static_cast<hip_dim_member_t>(params.threads.value[1]),
+        static_cast<hip_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
@@ -105,14 +114,16 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize,
+                                                shared_mem_size, hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void* args[] = {(void*)&body};
+        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
+                          hip_res, async, kernel_name);
       }
 
       RAJA_FT_END;
@@ -122,13 +133,18 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
   }
 
 
- //Version with explicit reduction parameters..
+  // Version with explicit reduction parameters..
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
@@ -141,45 +157,53 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[2]) };
+    hip_dim_t gridSize {
+        static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[2]) };
+    hip_dim_t blockSize {
+        static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
       RAJA::hip::detail::hipInfo launch_info;
-      launch_info.gridDim = gridSize;
-      launch_info.blockDim = blockSize;
+      launch_info.gridDim      = gridSize;
+      launch_info.blockDim     = blockSize;
       launch_info.dynamic_smem = &shared_mem_size;
-      launch_info.res = hip_res;
+      launch_info.res          = hip_res;
 
       {
-        using EXEC_POL = RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL =
+            RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize,
+                                                shared_mem_size, hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void* args[] = {(void*)&body, (void*)&launch_reducers};
+        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
+                          hip_res, async, kernel_name);
 
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -187,21 +211,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
 template <typename BODY, int num_threads>
 __launch_bounds__(num_threads, 1) __global__
-void launch_global_fcn_fixed(BODY body_in)
+    void launch_global_fcn_fixed(BODY body_in)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
@@ -210,39 +233,46 @@ void launch_global_fcn_fixed(BODY body_in)
 
 template <typename BODY, int num_threads, typename ReduceParams>
 __launch_bounds__(num_threads, 1) __global__
-void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params)
+    void launch_new_reduce_global_fcn_fixed(BODY body_in,
+                                            ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
 
 template <bool async, int nthreads>
-struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
+struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
+{
 
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_global_fcn_fixed<BODY, nthreads>);
+    auto func =
+        reinterpret_cast<const void*>(&launch_global_fcn_fixed<BODY, nthreads>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
@@ -250,18 +280,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(params.teams.value[0]),
+    hip_dim_t gridSize {static_cast<hip_dim_member_t>(params.teams.value[0]),
                         static_cast<hip_dim_member_t>(params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(params.teams.value[2]) };
+                        static_cast<hip_dim_member_t>(params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(params.threads.value[2]) };
+    hip_dim_t blockSize {
+        static_cast<hip_dim_member_t>(params.threads.value[0]),
+        static_cast<hip_dim_member_t>(params.threads.value[1]),
+        static_cast<hip_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
@@ -270,14 +302,16 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize,
+                                                shared_mem_size, hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void* args[] = {(void*)&body};
+        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
+                          hip_res, async, kernel_name);
       }
 
       RAJA_FT_END;
@@ -286,18 +320,24 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     return resources::EventProxy<resources::Resource>(res);
   }
 
- //Version with explicit reduction parameters..
+  // Version with explicit reduction parameters..
   template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
     auto func = reinterpret_cast<const void*>(
-        &launch_new_reduce_global_fcn_fixed<BODY, nthreads, camp::decay<ReduceParams>>);
+        &launch_new_reduce_global_fcn_fixed<BODY, nthreads,
+                                            camp::decay<ReduceParams>>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
@@ -305,45 +345,53 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[2]) };
+    hip_dim_t gridSize {
+        static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[2]) };
+    hip_dim_t blockSize {
+        static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
       RAJA::hip::detail::hipInfo launch_info;
-      launch_info.gridDim = gridSize;
-      launch_info.blockDim = blockSize;
+      launch_info.gridDim      = gridSize;
+      launch_info.blockDim     = blockSize;
       launch_info.dynamic_smem = &shared_mem_size;
-      launch_info.res = hip_res;
+      launch_info.res          = hip_res;
 
       {
-        using EXEC_POL = RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL =
+            RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize,
+                                                shared_mem_size, hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void* args[] = {(void*)&body, (void*)&launch_reducers};
+        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
+                          hip_res, async, kernel_name);
 
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -351,7 +399,6 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
@@ -359,43 +406,50 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
    HIP generic loop implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -403,29 +457,36 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1) {
+    if (i0 < len0 && i1 < len1)
+    {
       body(*(segment0.begin() + i0), *(segment1.begin() + i1));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -435,53 +496,62 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
     const diff_t i1 = IndexMapper1::template index<diff_t>();
     const diff_t i2 = IndexMapper2::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1 && i2 < len2) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
+    if (i0 < len0 && i1 < len1 && i2 < len2)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1),
            *(segment2.begin() + i2));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -492,34 +562,42 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1));
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1));
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -533,14 +611,16 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
     const diff_t i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) {
+        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
+        {
 
-          body(*(segment0.begin() + i0),
-               *(segment1.begin() + i1),
+          body(*(segment0.begin() + i0), *(segment1.begin() + i1),
                *(segment2.begin() + i2));
         }
       }
@@ -549,42 +629,49 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 };
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -592,31 +679,36 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           i0, i1);
+    if (i0 < len0 && i1 < len1)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -626,54 +718,62 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i1 = IndexMapper1::template index<diff_t>();
     const diff_t i2 = IndexMapper2::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1 && i2 < len2) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           *(segment2.begin() + i2),
-           i0, i1, i2);
+    if (i0 < len0 && i1 < len1 && i2 < len2)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1),
+           *(segment2.begin() + i2), i0, i1, i2);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -684,35 +784,42 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1),
-             i0, i1);
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -726,16 +833,17 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
     const diff_t i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) {
+        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
+        {
 
-          body(*(segment0.begin() + i0),
-               *(segment1.begin() + i1),
-               *(segment2.begin() + i2),
-               i0, i1, i2);
+          body(*(segment0.begin() + i0), *(segment1.begin() + i1),
+               *(segment2.begin() + i2), i0, i1, i2);
         }
       }
     }
@@ -746,31 +854,34 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
 /*
    HIP generic flattened loop implementations
 */
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           sync,
+                                           IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::hip::
+              hip_indexer<RAJA::iteration_mapping::Direct, sync, IndexMapper0>,
+          SEGMENT>
 {};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           kernel_sync_requirement::none,
+                                           IndexMapper0,
+                                           IndexMapper1>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -779,29 +890,35 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
 
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
 
-    const int i = i0 + i0_stride*i1;
+    const int i = i0 + i0_stride * i1;
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           kernel_sync_requirement::none,
+                                           IndexMapper0,
+                                           IndexMapper1,
+                                           IndexMapper2>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -812,39 +929,47 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    const int i = i0 + i0_stride*(i1 + i1_stride*i2);
+    const int i = i0 + i0_stride * (i1 + i1_stride * i2);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
+template <typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        sync,
+        IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::hip::hip_indexer<
+              RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+              sync,
+              IndexMapper0>,
+          SEGMENT>
 {};
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
+template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -854,29 +979,34 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const int i0_stride = IndexMapper0::template size<diff_t>();
     const int i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*i1;
-         i < len;
-         i += i0_stride*i1_stride) {
+    for (int i = i0 + i0_stride * i1; i < len; i += i0_stride * i1_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
+template <typename SEGMENT,
+          typename IndexMapper0,
+          typename IndexMapper1,
+          typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -888,9 +1018,9 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const int i1_stride = IndexMapper1::template size<diff_t>();
     const int i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*(i1 + i1_stride*i2);
-         i < len;
-         i += i0_stride*i1_stride*i2_stride) {
+    for (int i = i0 + i0_stride * (i1 + i1_stride * i2); i < len;
+         i += i0_stride * i1_stride * i2_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
@@ -901,101 +1031,122 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
    HIP generic tile implementations
 */
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t t = IndexMapper::template index<diff_t>();
-    const diff_t i = t * static_cast<diff_t>(tile_size);
+    const diff_t t   = IndexMapper::template index<diff_t>();
+    const diff_t i   = t * static_cast<diff_t>(tile_size);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
     }
   }
 };
 
 template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(tile_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(tile_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(tile_size);
 
-    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
     }
   }
diff --git a/include/RAJA/policy/hip/multi_reduce.hpp b/include/RAJA/policy/hip/multi_reduce.hpp
index 0d9d3899d8..5f06445a0f 100644
--- a/include/RAJA/policy/hip/multi_reduce.hpp
+++ b/include/RAJA/policy/hip/multi_reduce.hpp
@@ -46,9 +46,9 @@
 #include "RAJA/policy/hip/intrinsics.hpp"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/hip/atomic.hpp"
+#include "RAJA/policy/hip/atomic.hpp"
 #endif
 
 #include "RAJA/policy/hip/policy.hpp"
@@ -73,100 +73,124 @@ namespace impl
 //
 
 //! combine value into global memory
-template <typename Combiner, typename GetTallyIndex,
-          typename T, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins),
-                                                                      T identity,
-                                                                      int bin,
-                                                                      T value,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+template <typename Combiner,
+          typename GetTallyIndex,
+          typename T,
+          typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void
+block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins),
+                                         T identity,
+                                         int bin,
+                                         T value,
+                                         T* tally_mem,
+                                         GetTallyOffset get_tally_offset,
+                                         int tally_replication,
+                                         int tally_bins)
 {
-  if (value == identity) { return; }
+  if (value == identity)
+  {
+    return;
+  }
 
-  int tally_index = GetTallyIndex::template index<int>(); // globalWarpId by default
+  int tally_index =
+      GetTallyIndex::template index<int>();  // globalWarpId by default
   int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication);
-  int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
-  RAJA::reduce::hip::atomic<Combiner>{}(tally_mem[tally_offset], value);
+  int tally_offset =
+      get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+  RAJA::reduce::hip::atomic<Combiner> {}(tally_mem[tally_offset], value);
 }
 
 
 //! initialize shared memory
 template <typename T>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins,
-                                                           T identity,
-                                                           T* shared_mem,
-                                                           int shared_replication)
+RAJA_DEVICE RAJA_INLINE void
+block_multi_reduce_init_shmem(int num_bins,
+                              T identity,
+                              T* shared_mem,
+                              int shared_replication)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   for (int shmem_offset = threadId;
-       shmem_offset < shared_replication * num_bins;
-       shmem_offset += numThreads) {
+       shmem_offset < shared_replication * num_bins; shmem_offset += numThreads)
+  {
     shared_mem[shmem_offset] = identity;
   }
   __syncthreads();
 }
 
 //! combine value into shared memory
-template <typename Combiner, typename GetSharedIndex,
-          typename T, typename GetSharedOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins,
-                                                                     T identity,
-                                                                     int bin,
-                                                                     T value,
-                                                                     T* shared_mem,
-                                                                     GetSharedOffset get_shared_offset,
-                                                                     int shared_replication)
+template <typename Combiner,
+          typename GetSharedIndex,
+          typename T,
+          typename GetSharedOffset>
+RAJA_DEVICE RAJA_INLINE void
+block_multi_reduce_combine_shmem_atomic(int num_bins,
+                                        T identity,
+                                        int bin,
+                                        T value,
+                                        T* shared_mem,
+                                        GetSharedOffset get_shared_offset,
+                                        int shared_replication)
 {
-  if (value == identity) { return; }
+  if (value == identity)
+  {
+    return;
+  }
 
-  int shared_index = GetSharedIndex::template index<int>(); // threadId by default
+  int shared_index =
+      GetSharedIndex::template index<int>();  // threadId by default
   int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication);
-  int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+  int shmem_offset =
+      get_shared_offset(bin, num_bins, shared_rep, shared_replication);
 
-  RAJA::reduce::hip::atomic<Combiner>{}(shared_mem[shmem_offset], value);
+  RAJA::reduce::hip::atomic<Combiner> {}(shared_mem[shmem_offset], value);
 }
 
 //! combine value into shared memory
 template <typename Combiner,
-          typename T, typename GetSharedOffset, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins,
-                                                                      T identity,
-                                                                      T* shared_mem,
-                                                                      GetSharedOffset get_shared_offset,
-                                                                      int shared_replication,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+          typename T,
+          typename GetSharedOffset,
+          typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void
+grid_multi_reduce_shmem_to_global_atomic(int num_bins,
+                                         T identity,
+                                         T* shared_mem,
+                                         GetSharedOffset get_shared_offset,
+                                         int shared_replication,
+                                         T* tally_mem,
+                                         GetTallyOffset get_tally_offset,
+                                         int tally_replication,
+                                         int tally_bins)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
-                 (gridDim.x * gridDim.y) * blockIdx.z;
+                (gridDim.x * gridDim.y) * blockIdx.z;
 
   __syncthreads();
-  for (int bin = threadId; bin < num_bins; bin += numThreads) {
+  for (int bin = threadId; bin < num_bins; bin += numThreads)
+  {
 
     T value = identity;
-    for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) {
-      int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
-      Combiner{}(value, shared_mem[shmem_offset]);
+    for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep)
+    {
+      int shmem_offset =
+          get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+      Combiner {}(value, shared_mem[shmem_offset]);
     }
 
-    if (value != identity) {
+    if (value != identity)
+    {
       int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication);
-      int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
-      RAJA::reduce::hip::atomic<Combiner>{}(tally_mem[tally_offset], value);
+      int tally_offset =
+          get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+      RAJA::reduce::hip::atomic<Combiner> {}(tally_mem[tally_offset], value);
     }
-
   }
 }
 
@@ -185,48 +209,63 @@ template <typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_TallyData
 {
   //! setup permanent settings, allocate and initialize tally memory
-  template < typename Container >
-  MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity)
-      : m_tally_mem(nullptr)
-      , m_identity(identity)
-      , m_num_bins(container.size())
-      , m_tally_bins(get_tally_bins(m_num_bins))
-      , m_tally_replication(get_tally_replication())
+  template <typename Container>
+  MultiReduceGridAtomicHostInit_TallyData(Container const& container,
+                                          T const& identity)
+      : m_tally_mem(nullptr),
+        m_identity(identity),
+        m_num_bins(container.size()),
+        m_tally_bins(get_tally_bins(m_num_bins)),
+        m_tally_replication(get_tally_replication())
   {
-    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
+    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
+                               m_tally_replication);
   }
 
   MultiReduceGridAtomicHostInit_TallyData() = delete;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  ~MultiReduceGridAtomicHostInit_TallyData() = default;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  MultiReduceGridAtomicHostInit_TallyData&
+  operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData&
+  operator=(MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  ~MultiReduceGridAtomicHostInit_TallyData()           = default;
 
 
   //! reset permanent settings, reallocate and reset tally memory
-  template < typename Container >
+  template <typename Container>
   void reset_permanent(Container const& container, T const& identity)
   {
     int new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
+    if (new_num_bins != m_num_bins)
+    {
       teardown_permanent();
-      m_num_bins = new_num_bins;
-      m_tally_bins = get_tally_bins(m_num_bins);
+      m_num_bins          = new_num_bins;
+      m_tally_bins        = get_tally_bins(m_num_bins);
       m_tally_replication = get_tally_replication();
-      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
-    } else {
+      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
+                                 m_tally_replication);
+    }
+    else
+    {
       {
         int tally_rep = 0;
-        int bin = 0;
-        for (auto const& value : container) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value;
+        int bin       = 0;
+        for (auto const& value : container)
+        {
+          m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
+                                        m_tally_replication)] = value;
           ++bin;
         }
       }
-      for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) {
-        for (int bin = 0; bin < m_num_bins; ++bin) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity;
+      for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep)
+      {
+        for (int bin = 0; bin < m_num_bins; ++bin)
+        {
+          m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
+                                        m_tally_replication)] = identity;
         }
       }
     }
@@ -244,9 +283,11 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T get(int bin) const
   {
     ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
-          reducer(m_identity);
-    for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) {
-      int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+        reducer(m_identity);
+    for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep)
+    {
+      int tally_offset =
+          GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
       reducer.combine(m_tally_mem[tally_offset]);
     }
     return reducer.get_and_clear();
@@ -258,20 +299,27 @@ struct MultiReduceGridAtomicHostInit_TallyData
   T identity() const { return m_identity; }
 
 private:
-  static constexpr size_t s_tally_alignment = std::max(size_t(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
-                                                       size_t(RAJA::DATA_ALIGN));
-  static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
+  static constexpr size_t s_tally_alignment = std::max(
+      size_t(
+          policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
+      size_t(RAJA::DATA_ALIGN));
+  static constexpr size_t s_tally_bunch_size =
+      RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
 
   using tally_mempool_type = device_pinned_mempool_type;
-  using tally_tuning = typename tuning::GlobalAtomicReplicationTuning;
-  using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer;
+  using tally_tuning       = typename tuning::GlobalAtomicReplicationTuning;
+  using TallyAtomicReplicationConcretizer =
+      typename tally_tuning::AtomicReplicationConcretizer;
   using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator;
-  using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch<s_tally_bunch_size>;
+  using GetTallyOffset_rebind =
+      typename GetTallyOffset_rebind_rebunch::template rebunch<
+          s_tally_bunch_size>;
 
 
   static int get_tally_bins(int num_bins)
   {
-    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size;
+    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) *
+           s_tally_bunch_size;
   }
 
   static int get_tally_replication()
@@ -281,39 +329,50 @@ struct MultiReduceGridAtomicHostInit_TallyData
     min_tally_replication = omp_get_max_threads();
 #endif
 
-    struct {
+    struct
+    {
       int func_min_global_replication;
-    } func_data{min_tally_replication};
+    } func_data {min_tally_replication};
 
-    return TallyAtomicReplicationConcretizer{}.template
-        get_global_replication<int>(func_data);
+    return TallyAtomicReplicationConcretizer {}
+        .template get_global_replication<int>(func_data);
   }
 
-  template < typename Container >
-  static T* create_tally(Container const& container, T const& identity,
-                         int num_bins, int tally_bins, int tally_replication)
+  template <typename Container>
+  static T* create_tally(Container const& container,
+                         T const& identity,
+                         int num_bins,
+                         int tally_bins,
+                         int tally_replication)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
 
     T* tally_mem = tally_mempool_type::getInstance().template malloc<T>(
-        tally_replication*tally_bins, s_tally_alignment);
+        tally_replication * tally_bins, s_tally_alignment);
 
-    if (tally_replication > 0) {
+    if (tally_replication > 0)
+    {
       {
         int tally_rep = 0;
-        int bin = 0;
-        for (auto const& value : container) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(value);
+        int bin       = 0;
+        for (auto const& value : container)
+        {
+          int tally_offset =
+              GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(value);
           ++bin;
         }
       }
-      for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) {
-        for (int bin = 0; bin < num_bins; ++bin) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(identity);
+      for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep)
+      {
+        for (int bin = 0; bin < num_bins; ++bin)
+        {
+          int tally_offset =
+              GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(identity);
         }
       }
     }
@@ -321,15 +380,21 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
   static void destroy_tally(T*& tally_mem,
-                            int num_bins, int tally_bins, int tally_replication)
+                            int num_bins,
+                            int tally_bins,
+                            int tally_replication)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
 
-    for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) {
-      for (int bin = num_bins; bin > 0; --bin) {
-        int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication);
+    for (int tally_rep = tally_replication + 1; tally_rep > 0; --tally_rep)
+    {
+      for (int bin = num_bins; bin > 0; --bin)
+      {
+        int tally_offset = GetTallyOffset {}(bin - 1, tally_bins, tally_rep - 1,
+                                             tally_replication);
         tally_mem[tally_offset].~T();
       }
     }
@@ -338,14 +403,15 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
 protected:
-  using GetTallyIndex = typename tally_tuning::ReplicationIndexer;
+  using GetTallyIndex  = typename tally_tuning::ReplicationIndexer;
   using GetTallyOffset = typename GetTallyOffset_rebind::template rebind<int>;
 
   T* m_tally_mem;
   T m_identity;
   int m_num_bins;
   int m_tally_bins;
-  int m_tally_replication; // power of 2, at least the max number of omp threads
+  int m_tally_replication;  // power of 2, at least the max number of omp
+                            // threads
 };
 
 
@@ -354,34 +420,31 @@ template <typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_Data
     : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
 {
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! defer to tally data for some functions
-  using TallyData::TallyData;
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::TallyData;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, do nothing
-  void setup_launch(size_t RAJA_UNUSED_ARG(block_size))
-  { }
+  void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) {}
 
   //! teardown per launch, do nothing
-  void teardown_launch()
-  { }
+  void teardown_launch() {}
 
 
   //! setup on device, do nothing
   RAJA_DEVICE
-  void setup_device()
-  { }
+  void setup_device() {}
 
   //! finalize on device, do nothing
   RAJA_DEVICE
-  void finalize_device()
-  { }
+  void finalize_device() {}
 
 
   //! combine value on device, combine a value into the tally atomically
@@ -389,9 +452,8 @@ struct MultiReduceGridAtomicHostInit_Data
   void combine_device(int bin, T value)
   {
     impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-        m_num_bins, m_identity,
-        bin, value,
-        m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+        m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
+        m_tally_replication, m_tally_bins);
   }
 
   //! combine value on host, combine a value into the tally
@@ -401,18 +463,19 @@ struct MultiReduceGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
-    Combiner{}(m_tally_mem[tally_offset], value);
+    int tally_offset =
+        GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    Combiner {}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using typename TallyData::GetTallyIndex;
   using typename TallyData::GetTallyOffset;
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 };
 
@@ -422,57 +485,69 @@ template <typename Combiner, typename T, typename tuning>
 struct MultiReduceBlockThenGridAtomicHostInit_Data
     : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
 {
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! setup permanent settings, defer to tally data
-  template < typename Container >
-  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity)
-      : TallyData(container, identity)
-      , m_shared_offset(s_shared_offset_unknown)
-      , m_shared_replication(0)
-  { }
+  template <typename Container>
+  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container,
+                                              T const& identity)
+      : TallyData(container, identity),
+        m_shared_offset(s_shared_offset_unknown),
+        m_shared_replication(0)
+  {}
 
   MultiReduceBlockThenGridAtomicHostInit_Data() = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  ~MultiReduceBlockThenGridAtomicHostInit_Data() = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  MultiReduceBlockThenGridAtomicHostInit_Data&
+  operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data&
+  operator=(MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  ~MultiReduceBlockThenGridAtomicHostInit_Data()           = default;
 
 
   //! defer to tally data for some functions
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, setup shared memory parameters
   void setup_launch(size_t block_size)
   {
-    if (m_num_bins == size_t(0)) {
+    if (m_num_bins == size_t(0))
+    {
       m_shared_offset = s_shared_offset_invalid;
       return;
     }
 
-    size_t shared_replication = 0;
+    size_t shared_replication  = 0;
     const size_t shared_offset = allocateDynamicShmem<T>(
-        [&](size_t max_shmem_size) {
-
-      struct {
-        size_t func_threads_per_block;
-        size_t func_max_shared_replication_per_block;
-      } func_data{block_size, max_shmem_size / m_num_bins};
-
-      shared_replication = SharedAtomicReplicationConcretizer{}.template
-          get_shared_replication<size_t>(func_data);
-      return m_num_bins * shared_replication;
-    });
-
-    if (shared_offset != dynamic_smem_allocation_failure) {
+        [&](size_t max_shmem_size)
+        {
+          struct
+          {
+            size_t func_threads_per_block;
+            size_t func_max_shared_replication_per_block;
+          } func_data {block_size, max_shmem_size / m_num_bins};
+
+          shared_replication =
+              SharedAtomicReplicationConcretizer {}
+                  .template get_shared_replication<size_t>(func_data);
+          return m_num_bins * shared_replication;
+        });
+
+    if (shared_offset != dynamic_smem_allocation_failure)
+    {
       m_shared_replication = static_cast<int>(shared_replication);
-      m_shared_offset = static_cast<int>(shared_offset);
-    } else {
+      m_shared_offset      = static_cast<int>(shared_offset);
+    }
+    else
+    {
       m_shared_offset = s_shared_offset_invalid;
     }
   }
@@ -481,7 +556,7 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void teardown_launch()
   {
     m_shared_replication = 0;
-    m_shared_offset = s_shared_offset_unknown;
+    m_shared_offset      = s_shared_offset_unknown;
   }
 
 
@@ -490,10 +565,10 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void setup_device()
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
-      impl::block_multi_reduce_init_shmem(
-          m_num_bins, m_identity,
-          shared_mem, m_shared_replication);
+    if (shared_mem != nullptr)
+    {
+      impl::block_multi_reduce_init_shmem(m_num_bins, m_identity, shared_mem,
+                                          m_shared_replication);
     }
   }
 
@@ -502,11 +577,12 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void finalize_device()
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
+    if (shared_mem != nullptr)
+    {
       impl::grid_multi_reduce_shmem_to_global_atomic<Combiner>(
-          m_num_bins, m_identity,
-          shared_mem, GetSharedOffset{}, m_shared_replication,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins, m_identity, shared_mem, GetSharedOffset {},
+          m_shared_replication, m_tally_mem, GetTallyOffset {},
+          m_tally_replication, m_tally_bins);
     }
   }
 
@@ -516,16 +592,17 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void combine_device(int bin, T value)
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
+    if (shared_mem != nullptr)
+    {
       impl::block_multi_reduce_combine_shmem_atomic<Combiner, GetSharedIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          shared_mem, GetSharedOffset{}, m_shared_replication);
-    } else {
+          m_num_bins, m_identity, bin, value, shared_mem, GetSharedOffset {},
+          m_shared_replication);
+    }
+    else
+    {
       impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
+          m_tally_replication, m_tally_bins);
     }
   }
 
@@ -536,14 +613,16 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
-    Combiner{}(m_tally_mem[tally_offset], value);
+    int tally_offset =
+        GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    Combiner {}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using shared_tuning = typename tuning::SharedAtomicReplicationTuning;
-  using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer;
-  using GetSharedIndex = typename shared_tuning::ReplicationIndexer;
+  using SharedAtomicReplicationConcretizer =
+      typename shared_tuning::AtomicReplicationConcretizer;
+  using GetSharedIndex         = typename shared_tuning::ReplicationIndexer;
   using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator;
   using GetSharedOffset = typename GetSharedOffset_rebind::template rebind<int>;
 
@@ -551,24 +630,27 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   using typename TallyData::GetTallyOffset;
 
 
-  static constexpr int s_shared_offset_unknown = std::numeric_limits<int>::max();
-  static constexpr int s_shared_offset_invalid = std::numeric_limits<int>::max() - 1;
+  static constexpr int s_shared_offset_unknown =
+      std::numeric_limits<int>::max();
+  static constexpr int s_shared_offset_invalid =
+      std::numeric_limits<int>::max() - 1;
 
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 
-  int m_shared_offset; // in bytes
-  int m_shared_replication; // power of 2
+  int m_shared_offset;       // in bytes
+  int m_shared_replication;  // power of 2
 
 
   RAJA_DEVICE
   T* get_shared_mem() const
   {
-    if (m_shared_offset == s_shared_offset_invalid) {
+    if (m_shared_offset == s_shared_offset_invalid)
+    {
       return nullptr;
     }
     extern __shared__ char shared_mem[];
@@ -595,39 +677,49 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template <typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataHip
 {
-  static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::hip::hip_atomic_available<T>::value;
 
   //! hip reduction data storage class and folding algorithm
-  using reduce_data_type =
-      std::conditional_t<(atomic_available),
-        std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic),
-          hip::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-          std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic),
-            hip::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-            void>>,
+  using reduce_data_type = std::conditional_t<
+      (atomic_available),
+      std::conditional_t<
+          (tuning::algorithm ==
+           multi_reduce_algorithm::
+               init_host_combine_block_atomic_then_grid_atomic),
+          hip::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                           T,
+                                                           tuning>,
+          std::conditional_t<
+              (tuning::algorithm ==
+               multi_reduce_algorithm::init_host_combine_global_atomic),
+              hip::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                      T,
+                                                      tuning>,
+              void>>,
       void>;
 
 
   using SyncList = std::vector<resources::Hip>;
 
 public:
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataHip() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataHip>::value>* = nullptr >
+  template <typename Container,
+            std::enable_if_t<
+                !std::is_same<Container, MultiReduceDataHip>::value>* = nullptr>
   MultiReduceDataHip(Container const& container, T identity)
-      : m_parent(this)
-      , m_sync_list(new SyncList)
-      , m_data(container, identity)
-      , m_own_launch_data(false)
-  {
-  }
+      : m_parent(this),
+        m_sync_list(new SyncList),
+        m_data(container, identity),
+        m_own_launch_data(false)
+  {}
 
   //! copy and on host attempt to setup for device
   //  init val_ptr to avoid uninitialized read caused by host copy of
@@ -639,31 +731,35 @@ struct MultiReduceDataHip
 #else
       : m_parent(&other)
 #endif
-      , m_sync_list(other.m_sync_list)
-      , m_data(other.m_data)
-      , m_own_launch_data(false)
+        ,
+        m_sync_list(other.m_sync_list),
+        m_data(other.m_data),
+        m_own_launch_data(false)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_parent) {
-      if (setupReducers()) {
+    if (m_parent)
+    {
+      if (setupReducers())
+      {
         // the copy made in make_launch_body does this setup
         add_resource_to_synchronization_list(currentResource());
         m_data.setup_launch(currentBlockSize());
         m_own_launch_data = true;
-        m_parent = nullptr;
+        m_parent          = nullptr;
       }
     }
 #else
-    if (!m_parent->m_parent) {
+    if (!m_parent->m_parent)
+    {
       // the first copy on device enters this branch
       m_data.setup_device();
     }
 #endif
   }
 
-  MultiReduceDataHip(MultiReduceDataHip &&) = delete;
+  MultiReduceDataHip(MultiReduceDataHip&&)                 = delete;
   MultiReduceDataHip& operator=(MultiReduceDataHip const&) = delete;
-  MultiReduceDataHip& operator=(MultiReduceDataHip &&) = delete;
+  MultiReduceDataHip& operator=(MultiReduceDataHip&&)      = delete;
 
   //! cleanup resources owned by this copy
   //  on device store in pinned buffer on host
@@ -671,23 +767,30 @@ struct MultiReduceDataHip
   ~MultiReduceDataHip()
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_parent == this) {
+    if (m_parent == this)
+    {
       // the original object, owns permanent storage
       synchronize_resources_and_clear_list();
       delete m_sync_list;
       m_sync_list = nullptr;
       m_data.teardown_permanent();
-    } else if (m_parent) {
+    }
+    else if (m_parent)
+    {
       // do nothing
-    } else {
-      if (m_own_launch_data) {
+    }
+    else
+    {
+      if (m_own_launch_data)
+      {
         // the copy made in make_launch_body, owns launch data
         m_data.teardown_launch();
         m_own_launch_data = false;
       }
     }
 #else
-    if (!m_parent->m_parent) {
+    if (!m_parent->m_parent)
+    {
       // the first copy on device, does finalization on the device
       m_data.finalize_device();
     }
@@ -695,7 +798,7 @@ struct MultiReduceDataHip
   }
 
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
     synchronize_resources_and_clear_list();
@@ -729,15 +832,17 @@ struct MultiReduceDataHip
 
 
 private:
-  MultiReduceDataHip const *m_parent;
+  MultiReduceDataHip const* m_parent;
   SyncList* m_sync_list;
   reduce_data_type m_data;
   bool m_own_launch_data;
 
   void add_resource_to_synchronization_list(resources::Hip res)
   {
-    for (resources::Hip& list_res : *m_sync_list) {
-      if (list_res.get_stream() == res.get_stream()) {
+    for (resources::Hip& list_res : *m_sync_list)
+    {
+      if (list_res.get_stream() == res.get_stream())
+      {
         return;
       }
     }
@@ -746,7 +851,8 @@ struct MultiReduceDataHip
 
   void synchronize_resources_and_clear_list()
   {
-    for (resources::Hip& list_res : *m_sync_list) {
+    for (resources::Hip& list_res : *m_sync_list)
+    {
       ::RAJA::hip::synchronize(list_res);
     }
     m_sync_list->clear();
@@ -755,7 +861,8 @@ struct MultiReduceDataHip
 
 }  // end namespace hip
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy, hip::MultiReduceDataHip)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy,
+                                hip::MultiReduceDataHip)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/hip/params/kernel_name.hpp b/include/RAJA/policy/hip/params/kernel_name.hpp
index 30269f8406..db4d204aeb 100644
--- a/include/RAJA/policy/hip/params/kernel_name.hpp
+++ b/include/RAJA/policy/hip/params/kernel_name.hpp
@@ -11,42 +11,45 @@
 #include "roctx.h"
 #endif
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  init(KernelName& kn, const RAJA::hip::detail::hipInfo &)
-  {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>>
+init(KernelName& kn, const RAJA::hip::detail::hipInfo&)
+{
 #if defined(RAJA_ENABLE_ROCTX)
-    roctxRangePush(kn.name);
+  roctxRangePush(kn.name);
 #else
-    RAJA_UNUSED_VAR(kn);
+  RAJA_UNUSED_VAR(kn);
 #endif
-  }
-
-  // Combine
-  template<typename EXEC_POL>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  combine(KernelName&) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  resolve(KernelName&, const RAJA::hip::detail::hipInfo &)
-  {
+}
+
+// Combine
+template <typename EXEC_POL>
+RAJA_HOST_DEVICE camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>>
+combine(KernelName&)
+{}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>>
+resolve(KernelName&, const RAJA::hip::detail::hipInfo&)
+{
 #if defined(RAJA_ENABLE_ROCTX)
-    roctxRangePop();
+  roctxRangePop();
 #endif
-  }
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_HIP_REDUCE_HPP
+#endif  //  NEW_REDUCE_HIP_REDUCE_HPP
diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp
index a3da07ee2c..38dd12b43a 100644
--- a/include/RAJA/policy/hip/params/reduce.hpp
+++ b/include/RAJA/policy/hip/params/reduce.hpp
@@ -8,54 +8,56 @@
 #include "RAJA/policy/hip/reduce.hpp"
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red, RAJA::hip::detail::hipInfo& hi)
-  {
-    red.devicetarget = RAJA::hip::pinned_mempool_type::getInstance().template malloc<T>(1);
-    red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z);
-    red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& red)
-  {
-    RAJA::hip::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter,OP>( red.devicetarget,
-                                                                            red.getVal(),
-                                                                            red.device_mem,
-                                                                            red.device_count);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red, RAJA::hip::detail::hipInfo& hi)
-  {
-    // complete reduction
-    hi.res.wait();
-
-    red.combineTarget(*red.devicetarget);
-
-    // free memory
-    RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count);
-    red.device_count = nullptr;
-    red.device_mem.deallocate();
-    RAJA::hip::pinned_mempool_type::getInstance().free(red.devicetarget);
-    red.devicetarget = nullptr;
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>>
+init(Reducer<OP, T, VOp>& red, RAJA::hip::detail::hipInfo& hi)
+{
+  red.devicetarget =
+      RAJA::hip::pinned_mempool_type::getInstance().template malloc<T>(1);
+  red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z);
+  red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance()
+                         .template malloc<unsigned int>(1);
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+RAJA_HOST_DEVICE camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>>
+combine(Reducer<OP, T, VOp>& red)
+{
+  RAJA::hip::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(
+      red.devicetarget, red.getVal(), red.device_mem, red.device_count);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>>
+resolve(Reducer<OP, T, VOp>& red, RAJA::hip::detail::hipInfo& hi)
+{
+  // complete reduction
+  hi.res.wait();
+
+  red.combineTarget(*red.devicetarget);
+
+  // free memory
+  RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count);
+  red.device_count = nullptr;
+  red.device_mem.deallocate();
+  RAJA::hip::pinned_mempool_type::getInstance().free(red.devicetarget);
+  red.devicetarget = nullptr;
+}
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_HIP_REDUCE_HPP
+#endif  //  NEW_REDUCE_HIP_REDUCE_HPP
diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index a9f9027675..2491f5dc05 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -38,7 +38,7 @@
 namespace RAJA
 {
 
-using hip_dim_t = dim3;
+using hip_dim_t        = dim3;
 using hip_dim_member_t = camp::decay<decltype(std::declval<hip_dim_t>().x)>;
 
 //
@@ -56,12 +56,14 @@ using hip_dim_member_t = camp::decay<decltype(std::declval<hip_dim_t>().x)>;
 namespace detail
 {
 template <bool Async>
-struct get_launch {
+struct get_launch
+{
   static constexpr RAJA::Launch value = RAJA::Launch::async;
 };
 
 template <>
-struct get_launch<false> {
+struct get_launch<false>
+{
   static constexpr RAJA::Launch value = RAJA::Launch::sync;
 };
 }  // end namespace detail
@@ -70,16 +72,16 @@ namespace hip
 {
 
 /// Type representing thread and block indexing within a grid
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct IndexGlobal;
 
-template<typename ...indexers>
+template <typename... indexers>
 struct IndexFlatten;
 
-template<size_t divisor, typename index>
+template <size_t divisor, typename index>
 struct IndexDivide;
 
-template<size_t divisor, typename index>
+template <size_t divisor, typename index>
 struct IndexModulo;
 
 
@@ -91,13 +93,14 @@ struct IndexModulo;
  */
 struct MaxOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
-    IdxT device_sm_per_device = data.device_sm_per_device;
+    IdxT device_sm_per_device   = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -110,26 +113,31 @@ struct MaxOccupancyConcretizer
  * maximum grid size:
  * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm
  */
-template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
+template <typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
 struct FractionOffsetOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     using Fraction = typename t_Fraction::template rebind<IdxT>;
 
-    IdxT device_sm_per_device = data.device_sm_per_device;
+    IdxT device_sm_per_device   = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) {
+    if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0))
+    {
       func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm);
     }
 
-    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) {
-      func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
+    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) >
+        IdxT(0))
+    {
+      func_max_blocks_per_sm =
+          IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
     }
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -143,22 +151,27 @@ struct FractionOffsetOccupancyConcretizer
  * Otherwise use the given AvoidMaxOccupancyCalculator to determine the
  * maximum grid size.
  */
-template < typename AvoidMaxOccupancyConcretizer >
+template <typename AvoidMaxOccupancyConcretizer>
 struct AvoidDeviceMaxThreadOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
-    IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
-    IdxT func_threads_per_block = data.func_threads_per_block;
+    IdxT func_max_blocks_per_sm    = data.func_max_blocks_per_sm;
+    IdxT func_threads_per_block    = data.func_threads_per_block;
 
-    IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm;
+    IdxT func_max_threads_per_sm =
+        func_threads_per_block * func_max_blocks_per_sm;
 
-    if (func_max_threads_per_sm < device_max_threads_per_sm) {
+    if (func_max_threads_per_sm < device_max_threads_per_sm)
+    {
       return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
-    } else {
-      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
+    }
+    else
+    {
+      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
+          data);
     }
   }
 };
@@ -167,10 +180,10 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer
 /*!
  * Get an amount of replication that is preferred_replication.
  */
-template < size_t preferred_replication >
+template <size_t preferred_replication>
 struct ConstantPreferredReplicationConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data))
   {
     return IdxT(preferred_replication);
@@ -182,19 +195,23 @@ struct ConstantPreferredReplicationConcretizer
  * data.func_threads_per_block is less than t_cutoff or
  * preferred_replication_after_cutoff otherwise.
  */
-template < size_t t_cutoff, size_t preferred_replication_before_cutoff,
-                            size_t preferred_replication_after_cutoff >
+template <size_t t_cutoff,
+          size_t preferred_replication_before_cutoff,
+          size_t preferred_replication_after_cutoff>
 struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& data)
   {
-    IdxT cutoff = t_cutoff;
+    IdxT cutoff                 = t_cutoff;
     IdxT func_threads_per_block = data.func_threads_per_block;
 
-    if (func_threads_per_block < cutoff) {
+    if (func_threads_per_block < cutoff)
+    {
       return IdxT(preferred_replication_before_cutoff);
-    } else {
+    }
+    else
+    {
       return IdxT(preferred_replication_after_cutoff);
     }
   }
@@ -205,19 +222,21 @@ struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
  * most the amount given by data.func_max_shared_replication_per_block or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
+template <typename GetPreferredReplication>
 struct SharedAtomicReplicationMaxPow2Concretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_shared_replication(Data const& data)
   {
-    IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block;
+    IdxT func_max_shared_replication_per_block =
+        data.func_max_shared_replication_per_block;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
+            data);
 
-    return prev_pow2(std::min(preferred_replication,
-                              func_max_shared_replication_per_block));
+    return prev_pow2(
+        std::min(preferred_replication, func_max_shared_replication_per_block));
   }
 };
 
@@ -226,18 +245,20 @@ struct SharedAtomicReplicationMaxPow2Concretizer
  * least the amount given by data.func_min_global_replication or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
+template <typename GetPreferredReplication>
 struct GlobalAtomicReplicationMinPow2Concretizer
 {
-  template < typename IdxT, typename Data >
+  template <typename IdxT, typename Data>
   static IdxT get_global_replication(Data const& data)
   {
     IdxT func_min_global_replication = data.func_min_global_replication;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
+            data);
 
-    return next_pow2(std::max(preferred_replication, func_min_global_replication));
+    return next_pow2(
+        std::max(preferred_replication, func_min_global_replication));
   }
 };
 
@@ -255,14 +276,16 @@ enum struct block_communication_mode : int
   block_fence
 };
 
-template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode,
-           size_t t_replication, size_t t_atomic_stride >
+template <reduce_algorithm t_algorithm,
+          block_communication_mode t_comm_mode,
+          size_t t_replication,
+          size_t t_atomic_stride>
 struct ReduceTuning
 {
-  static constexpr reduce_algorithm algorithm = t_algorithm;
+  static constexpr reduce_algorithm algorithm         = t_algorithm;
   static constexpr block_communication_mode comm_mode = t_comm_mode;
-  static constexpr size_t replication = t_replication;
-  static constexpr size_t atomic_stride = t_atomic_stride;
+  static constexpr size_t replication                 = t_replication;
+  static constexpr size_t atomic_stride               = t_atomic_stride;
   static constexpr bool consistent =
       (algorithm == reduce_algorithm::combine_last_block);
 };
@@ -274,25 +297,25 @@ enum struct multi_reduce_algorithm : int
   init_host_combine_global_atomic
 };
 
-template < typename t_AtomicReplicationConcretizer,
-           typename t_ReplicationIndexer,
-           typename t_OffsetCalculator >
+template <typename t_AtomicReplicationConcretizer,
+          typename t_ReplicationIndexer,
+          typename t_OffsetCalculator>
 struct AtomicReplicationTuning
 {
   using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer;
-  using ReplicationIndexer = t_ReplicationIndexer;
-  using OffsetCalculator = t_OffsetCalculator;
+  using ReplicationIndexer           = t_ReplicationIndexer;
+  using OffsetCalculator             = t_OffsetCalculator;
 };
 
-template < multi_reduce_algorithm t_algorithm,
-           typename t_SharedAtomicReplicationTuning,
-           typename t_GlobalAtomicReplicationTuning >
+template <multi_reduce_algorithm t_algorithm,
+          typename t_SharedAtomicReplicationTuning,
+          typename t_GlobalAtomicReplicationTuning>
 struct MultiReduceTuning
 {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
   using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
   using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
-  static constexpr bool consistent = false;
+  static constexpr bool consistent    = false;
 };
 
 }  // namespace hip
@@ -307,16 +330,19 @@ struct DeviceConstants
   RAJA::Index_type WARP_SIZE;
   RAJA::Index_type MAX_BLOCK_SIZE;
   RAJA::Index_type MAX_WARPS;
-  RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics
+  RAJA::Index_type
+      ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE;  // basically the cache line size of
+                                             // the cache level that handles
+                                             // atomics
 
   constexpr DeviceConstants(RAJA::Index_type warp_size,
                             RAJA::Index_type max_block_size,
                             RAJA::Index_type atomic_cache_line_bytes) noexcept
-    : WARP_SIZE(warp_size)
-    , MAX_BLOCK_SIZE(max_block_size)
-    , MAX_WARPS(max_block_size / warp_size)
-    , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
-  { }
+      : WARP_SIZE(warp_size),
+        MAX_BLOCK_SIZE(max_block_size),
+        MAX_WARPS(max_block_size / warp_size),
+        ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
+  {}
 };
 
 //
@@ -324,49 +350,59 @@ struct DeviceConstants
 // values for HIP warp size and max block size.
 //
 #if defined(__HIP_PLATFORM_AMD__)
-constexpr DeviceConstants device_constants(64, 1024, 64); // MI300A
+constexpr DeviceConstants device_constants(64, 1024, 64);  // MI300A
 // constexpr DeviceConstants device_constants(64, 1024, 128); // MI250X
 #elif defined(__HIP_PLATFORM_NVIDIA__)
-constexpr DeviceConstants device_constants(32, 1024, 32); // V100
+constexpr DeviceConstants device_constants(32, 1024, 32);  // V100
 #endif
 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
-              "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS");
+              "RAJA Assumption Broken: device_constants.WARP_SIZE < "
+              "device_constants.MAX_WARPS");
 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
               "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
               "a multiple of device_constants.WARP_SIZE");
 
 
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct hip_indexer {};
+template <typename _IterationMapping,
+          kernel_sync_requirement sync,
+          typename... _IterationGetters>
+struct hip_indexer
+{};
 
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
+template <typename _IterationMapping,
+          kernel_sync_requirement sync,
+          typename... _IterationGetters>
 struct hip_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t<
-  RAJA::Policy::hip,
-  RAJA::Pattern::region,
-  detail::get_launch<true /*async */>::value,
-  RAJA::Platform::hip> {
+                                 RAJA::Policy::hip,
+                                 RAJA::Pattern::region,
+                                 detail::get_launch<true /*async */>::value,
+                                 RAJA::Platform::hip>
+{
   using IterationGetter = RAJA::hip::IndexFlatten<_IterationGetters...>;
 };
 
-template <typename _IterationMapping, typename _IterationGetter, typename _LaunchConcretizer,
+template <typename _IterationMapping,
+          typename _IterationGetter,
+          typename _LaunchConcretizer,
           bool Async = false>
 struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::forall,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
-  using IterationMapping = _IterationMapping;
-  using IterationGetter = _IterationGetter;
+                      RAJA::Policy::hip,
+                      RAJA::Pattern::forall,
+                      detail::get_launch<Async>::value,
+                      RAJA::Platform::hip>
+{
+  using IterationMapping  = _IterationMapping;
+  using IterationGetter   = _IterationGetter;
   using LaunchConcretizer = _LaunchConcretizer;
 };
 
 template <bool Async, int num_threads = named_usage::unspecified>
 struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::region,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
-};
+                          RAJA::Policy::hip,
+                          RAJA::Pattern::region,
+                          detail::get_launch<Async>::value,
+                          RAJA::Platform::hip>
+{};
 
 
 //
@@ -378,11 +414,11 @@ struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
 ///
 template <size_t BLOCK_SIZE, bool Async = false>
 struct hip_work : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::workgroup_exec,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
-};
+                      RAJA::Policy::hip,
+                      RAJA::Pattern::workgroup_exec,
+                      detail::get_launch<Async>::value,
+                      RAJA::Platform::hip>
+{};
 
 /// execute the enqueued loops in an unordered fashion by mapping loops to
 /// blocks in the y direction and loop iterations to threads in the x direction
@@ -390,10 +426,10 @@ struct hip_work : public RAJA::make_policy_pattern_launch_platform_t<
 /// of all the loops
 struct unordered_hip_loop_y_block_iter_x_threadblock_average
     : public RAJA::make_policy_pattern_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::workgroup_order,
-                       RAJA::Platform::hip> {
-};
+          RAJA::Policy::hip,
+          RAJA::Pattern::workgroup_order,
+          RAJA::Platform::hip>
+{};
 
 
 ///
@@ -405,36 +441,36 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average
 ///
 
 
-template < typename tuning >
-struct hip_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::hip,
-                                                RAJA::Pattern::reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::hip,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
-};
+template <typename tuning>
+struct hip_reduce_policy : public RAJA::make_policy_pattern_launch_platform_t<
+                               RAJA::Policy::hip,
+                               RAJA::Pattern::reduce,
+                               detail::get_launch<false>::value,
+                               RAJA::Platform::hip,
+                               std::conditional_t<tuning::consistent,
+                                                  reduce::ordered,
+                                                  reduce::unordered>>
+{};
 
-template < typename tuning >
+template <typename tuning>
 struct hip_multi_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::hip,
-                                                RAJA::Pattern::multi_reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::hip,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
-};
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::hip,
+          RAJA::Pattern::multi_reduce,
+          detail::get_launch<false>::value,
+          RAJA::Platform::hip,
+          std::conditional_t<tuning::consistent,
+                             reduce::ordered,
+                             reduce::unordered>>
+{};
 
 /*!
  * Hip atomic policy for using hip atomics on the device and
  * the provided policy on the host
  */
-template<typename host_policy>
-struct hip_atomic_explicit{};
+template <typename host_policy>
+struct hip_atomic_explicit
+{};
 
 /*!
  * Default hip atomic policy uses hip atomics on the device and non-atomics
@@ -445,11 +481,13 @@ using hip_atomic = hip_atomic_explicit<seq_atomic>;
 
 // Policy for RAJA::statement::Reduce that reduces threads in a block
 // down to threadIdx 0
-struct hip_block_reduce{};
+struct hip_block_reduce
+{};
 
 // Policy for RAJA::statement::Reduce that reduces threads in a warp
 // down to the first lane of the warp
-struct hip_warp_reduce{};
+struct hip_warp_reduce
+{};
 
 // Policy to map work directly to threads within a warp
 // Maximum iteration count is WARP_SIZE
@@ -463,15 +501,15 @@ struct hip_warp_reduce{};
 // struct hip_warp_loop{};
 
 
-
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with hip_thread_x_*
 // Multiple warps have to be created by using hip_thread_{yz}_*
 // Since we are masking specific threads, multiple nested
 // hip_warp_masked
 // can be used to create complex thread interleaving patterns
-template<typename Mask>
-struct hip_warp_masked_direct {};
+template <typename Mask>
+struct hip_warp_masked_direct
+{};
 
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with hip_thread_x_*
@@ -479,21 +517,24 @@ struct hip_warp_masked_direct {};
 // Since we are masking specific threads, multiple nested
 // hip_warp_masked
 // can be used to create complex thread interleaving patterns
-template<typename Mask>
-struct hip_warp_masked_loop {};
+template <typename Mask>
+struct hip_warp_masked_loop
+{};
 
 
-template<typename Mask>
-struct hip_thread_masked_direct {};
+template <typename Mask>
+struct hip_thread_masked_direct
+{};
 
-template<typename Mask>
-struct hip_thread_masked_loop {};
+template <typename Mask>
+struct hip_thread_masked_loop
+{};
 
 
 struct hip_synchronize : make_policy_pattern_launch_t<Policy::hip,
-                                                       Pattern::synchronize,
-                                                       Launch::sync> {
-};
+                                                      Pattern::synchronize,
+                                                      Launch::sync>
+{};
 
 }  // end namespace hip
 }  // end namespace policy
@@ -505,141 +546,131 @@ namespace internal
 RAJA_INLINE
 int get_size(hip_dim_t dims)
 {
-  if(dims.x == 0 && dims.y == 0 && dims.z == 0){
+  if (dims.x == 0 && dims.y == 0 && dims.z == 0)
+  {
     return 0;
   }
-  return (dims.x ? dims.x : 1) *
-         (dims.y ? dims.y : 1) *
-         (dims.z ? dims.z : 1);
+  return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
 }
 
-struct HipDims {
+struct HipDims
+{
 
-  hip_dim_t blocks{0,0,0};
-  hip_dim_t threads{0,0,0};
+  hip_dim_t blocks {0, 0, 0};
+  hip_dim_t threads {0, 0, 0};
 
-  HipDims() = default;
-  HipDims(HipDims const&) = default;
+  HipDims()                          = default;
+  HipDims(HipDims const&)            = default;
   HipDims& operator=(HipDims const&) = default;
 
   RAJA_INLINE
   HipDims(hip_dim_member_t default_val)
-    : blocks{default_val, default_val, default_val}
-    , threads{default_val, default_val, default_val}
-  { }
+      : blocks {default_val, default_val, default_val},
+        threads {default_val, default_val, default_val}
+  {}
 
   RAJA_INLINE
-  int num_blocks() const {
-    return get_size(blocks);
-  }
+  int num_blocks() const { return get_size(blocks); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return get_size(threads);
-  }
+  int num_threads() const { return get_size(threads); }
 
   RAJA_INLINE
-  hip_dim_t get_blocks() const {
-    if (num_blocks() != 0) {
-      return {(blocks.x ? blocks.x : 1),
-              (blocks.y ? blocks.y : 1),
+  hip_dim_t get_blocks() const
+  {
+    if (num_blocks() != 0)
+    {
+      return {(blocks.x ? blocks.x : 1), (blocks.y ? blocks.y : 1),
               (blocks.z ? blocks.z : 1)};
-    } else {
+    }
+    else
+    {
       return blocks;
     }
   }
 
   RAJA_INLINE
-  hip_dim_t get_threads() const {
-    if (num_threads() != 0) {
-      return {(threads.x ? threads.x : 1),
-              (threads.y ? threads.y : 1),
+  hip_dim_t get_threads() const
+  {
+    if (num_threads() != 0)
+    {
+      return {(threads.x ? threads.x : 1), (threads.y ? threads.y : 1),
               (threads.z ? threads.z : 1)};
-    } else {
+    }
+    else
+    {
       return threads;
     }
   }
 };
 
-template<named_dim dim>
+template <named_dim dim>
 struct HipDimHelper;
 
-template<>
-struct HipDimHelper<named_dim::x>{
+template <>
+struct HipDimHelper<named_dim::x>
+{
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.x;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.x = value;
   }
 };
 
-template<>
-struct HipDimHelper<named_dim::y>{
+template <>
+struct HipDimHelper<named_dim::y>
+{
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.y;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.y = value;
   }
 };
 
-template<>
-struct HipDimHelper<named_dim::z>{
+template <>
+struct HipDimHelper<named_dim::z>
+{
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.z;
   }
 
-  template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  template <typename dim_t>
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.z = value;
   }
 };
 
-template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-constexpr
-hip_dim_member_t get_hip_dim(dim_t const &d)
+template <named_dim dim, typename dim_t>
+RAJA_HOST_DEVICE constexpr hip_dim_member_t get_hip_dim(dim_t const& d)
 {
   return HipDimHelper<dim>::get(d);
 }
 
-template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-void set_hip_dim(dim_t &d, hip_dim_member_t value)
+template <named_dim dim, typename dim_t>
+RAJA_HOST_DEVICE void set_hip_dim(dim_t& d, hip_dim_member_t value)
 {
   return HipDimHelper<dim>::set(d, value);
 }
 
-} // namespace internal
+}  // namespace internal
 
 namespace hip
 {
@@ -648,14 +679,13 @@ namespace hip
 struct IndexSize
 {
   hip_dim_member_t block_size = named_usage::unspecified;
-  hip_dim_member_t grid_size = named_usage::unspecified;
-
-  RAJA_HOST_DEVICE constexpr
-  IndexSize(hip_dim_member_t _block_size = named_usage::unspecified,
-            hip_dim_member_t _grid_size = named_usage::unspecified)
-    : block_size(_block_size)
-    , grid_size(_grid_size)
-  { }
+  hip_dim_member_t grid_size  = named_usage::unspecified;
+
+  RAJA_HOST_DEVICE constexpr IndexSize(
+      hip_dim_member_t _block_size = named_usage::unspecified,
+      hip_dim_member_t _grid_size  = named_usage::unspecified)
+      : block_size(_block_size), grid_size(_grid_size)
+  {}
 };
 
 /// Type representing thread indexing within a grid
@@ -663,436 +693,457 @@ struct IndexSize
 
 /// useful for global indexing
 /// with fixed block size and fixed grid size
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct IndexGlobal
 {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(block_size) * static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed block size of 1 and fixed grid size
-template<named_dim dim, int GRID_SIZE>
+template <named_dim dim, int GRID_SIZE>
 struct IndexGlobal<dim, 1, GRID_SIZE>
 {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = 1;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed block size and fixed grid size of 1
-template<named_dim dim, int BLOCK_SIZE>
+template <named_dim dim, int BLOCK_SIZE>
 struct IndexGlobal<dim, BLOCK_SIZE, 1>
 {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
 /// with fixed block size and fixed grid size of 1
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, 1, 1>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
 /// with dynamic block size and fixed grid size
-template<named_dim dim, int GRID_SIZE>
+template <named_dim dim, int GRID_SIZE>
 struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>
 {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(grid_size);
   }
 };
 /// with dynamic block size and fixed grid size of 1
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, 1>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockDim));
   }
 };
 
 /// with fixed block size and dynamic grid size
-template<named_dim dim, int BLOCK_SIZE>
+template <named_dim dim, int BLOCK_SIZE>
 struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>
 {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 /// with fixed block size of 1 and dynamic grid size
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, 1, named_usage::unspecified>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
 /// with dynamic block size and dynamic grid size
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
 /// useful for indexing blocks (ignores thread indices)
 /// with fixed grid size
-template<named_dim dim, int GRID_SIZE>
+template <named_dim dim, int GRID_SIZE>
 struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE>
 {
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
 /// with fixed grid sized of 1
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, 1>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 /// with dynamic grid size
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
 /// useful for indexing threads (ignores block indices)
 /// with fixed block size
-template<named_dim dim, int BLOCK_SIZE>
+template <named_dim dim, int BLOCK_SIZE>
 struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>
 {
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
 /// with fixed block size of 1
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, 1, named_usage::ignored>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 /// with dynamic block size
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockDim));
   }
 };
 
 /// useful for doing single threaded sequential tasks
 /// (ignores thread and block indices)
-template<named_dim dim>
+template <named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
 // useful for flatten global index (includes x)
-template<typename x_index>
+template <typename x_index>
 struct IndexFlatten<x_index>
 {
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>();
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>();
+    return x_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y)
-template<typename x_index, typename y_index>
+template <typename x_index, typename y_index>
 struct IndexFlatten<x_index, y_index>
 {
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>());
-
+           x_index::template size<IdxT>() * (y_index::template index<IdxT>());
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y,z)
-template<typename x_index, typename y_index, typename z_index>
+template <typename x_index, typename y_index, typename z_index>
 struct IndexFlatten<x_index, y_index, z_index>
 {
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>() +
-                                         y_index::template size<IdxT>() * z_index::template index<IdxT>());
+           x_index::template size<IdxT>() *
+               (y_index::template index<IdxT>() +
+                y_index::template size<IdxT>() *
+                    z_index::template index<IdxT>());
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> () * z_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>() *
+           z_index::template size<IdxT>();
   }
-
 };
 
-template<size_t divisor, typename indexer>
+template <size_t divisor, typename indexer>
 struct IndexDivide
 {
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() / static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(), static_cast<IdxT>(divisor));
+    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(),
+                                   static_cast<IdxT>(divisor));
   }
 };
 
-template<size_t divisor, typename indexer>
+template <size_t divisor, typename indexer>
 struct IndexModulo
 {
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() % static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template <typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(divisor);
@@ -1101,10 +1152,10 @@ struct IndexModulo
 
 
 // helper to get just the thread indexing part of IndexGlobal
-template < typename index_global >
+template <typename index_global>
 struct get_index_thread;
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
 {
   using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
@@ -1119,10 +1170,10 @@ struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
 };
 
 // helper to get just the block indexing part of IndexGlobal
-template < typename index_global >
+template <typename index_global>
 struct get_index_block;
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+template <named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
 {
   using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
@@ -1137,89 +1188,88 @@ struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
 };
 
 
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
 
-template <size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
+template <size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified>
 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
                                 thread_y<BLOCK_SIZE_Y>,
                                 thread_z<BLOCK_SIZE_Z>>;
 
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template <size_t GRID_SIZE = named_usage::unspecified>
 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
 
-template <size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+template <size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
                                block_y<GRID_SIZE_Y>,
                                block_z<GRID_SIZE_Z>>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template <size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
 
 
 template <size_t BLOCK_SIZE_X,
           size_t BLOCK_SIZE_Y,
           size_t BLOCK_SIZE_Z,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+          size_t GRID_SIZE_X = named_usage::unspecified,
+          size_t GRID_SIZE_Y = named_usage::unspecified,
+          size_t GRID_SIZE_Z = named_usage::unspecified>
 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
                                 global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
                                 global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
 
 
-template <size_t WARP_SIZE=RAJA::policy::hip::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
-using warp_xyz = IndexDivide<WARP_SIZE,
-                             thread_xyz<BLOCK_SIZE_X,
-                                        BLOCK_SIZE_Y,
-                                        BLOCK_SIZE_Z>>;
-
-template <size_t WARP_SIZE=RAJA::policy::hip::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
-using warp_global_xyz = IndexFlatten<warp_xyz<WARP_SIZE,
-                                              BLOCK_SIZE_X,
-                                              BLOCK_SIZE_Y,
-                                              BLOCK_SIZE_Z>,
-                                     block_xyz<GRID_SIZE_X,
-                                               GRID_SIZE_Y,
-                                               GRID_SIZE_Z>>;
-
-} // namespace hip
+template <size_t WARP_SIZE    = RAJA::policy::hip::device_constants.WARP_SIZE,
+          size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified>
+using warp_xyz =
+    IndexDivide<WARP_SIZE,
+                thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
+
+template <size_t WARP_SIZE    = RAJA::policy::hip::device_constants.WARP_SIZE,
+          size_t BLOCK_SIZE_X = named_usage::unspecified,
+          size_t BLOCK_SIZE_Y = named_usage::unspecified,
+          size_t BLOCK_SIZE_Z = named_usage::unspecified,
+          size_t GRID_SIZE_X  = named_usage::unspecified,
+          size_t GRID_SIZE_Y  = named_usage::unspecified,
+          size_t GRID_SIZE_Z  = named_usage::unspecified>
+using warp_global_xyz =
+    IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
+                 block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
+
+}  // namespace hip
 
 // contretizers used in forall, scan, and sort policies
 
-using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer<hip::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
+using HipAvoidDeviceMaxThreadOccupancyConcretizer =
+    hip::AvoidDeviceMaxThreadOccupancyConcretizer<
+        hip::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
 
-template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
-using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
+template <typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
+using HipFractionOffsetOccupancyConcretizer =
+    hip::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
 
 using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer;
 
-using HipReduceDefaultConcretizer = HipFractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 2>, 0>;
+using HipReduceDefaultConcretizer =
+    HipFractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 2>, 0>;
 
 using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer;
 
@@ -1227,83 +1277,111 @@ using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
 using hip_exec_grid = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE, GRID_SIZE>,
-    HipDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE, GRID_SIZE>,
+    HipDefaultConcretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE, size_t GRID_SIZE>
 using hip_exec_grid_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE, GRID_SIZE>,
-    HipDefaultConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE, GRID_SIZE>,
+    HipDefaultConcretizer,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
-using hip_exec = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, Async>;
+using hip_exec = policy::hip::hip_exec<iteration_mapping::Direct,
+                                       hip::global_x<BLOCK_SIZE>,
+                                       HipDefaultConcretizer,
+                                       Async>;
 
 template <size_t BLOCK_SIZE>
-using hip_exec_async = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, true>;
+using hip_exec_async = policy::hip::hip_exec<iteration_mapping::Direct,
+                                             hip::global_x<BLOCK_SIZE>,
+                                             HipDefaultConcretizer,
+                                             true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_occ_calc = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipDefaultConcretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using hip_exec_occ_calc_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipDefaultConcretizer,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_occ_max = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipMaxOccupancyConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipMaxOccupancyConcretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using hip_exec_occ_max_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipMaxOccupancyConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipMaxOccupancyConcretizer,
+    true>;
 
 template <size_t BLOCK_SIZE, typename Fraction, bool Async = false>
 using hip_exec_occ_fraction = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipFractionOffsetOccupancyConcretizer<Fraction, 0>, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    Async>;
 
 template <size_t BLOCK_SIZE, typename Fraction>
 using hip_exec_occ_fraction_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipFractionOffsetOccupancyConcretizer<Fraction, 0>, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    true>;
 
 template <size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
 using hip_exec_occ_custom = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    Concretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    Concretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE, typename Concretizer>
 using hip_exec_occ_custom_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    Concretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    Concretizer,
+    true>;
 
 template <size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_with_reduce = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipReduceDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipReduceDefaultConcretizer,
+    Async>;
 
 template <size_t BLOCK_SIZE>
 using hip_exec_with_reduce_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipReduceDefaultConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipReduceDefaultConcretizer,
+    true>;
 
 template <bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
-using hip_exec_base = std::conditional_t<with_reduce,
-    hip_exec_with_reduce<BLOCK_SIZE, Async>,
-    hip_exec<BLOCK_SIZE, Async>>;
+using hip_exec_base =
+    std::conditional_t<with_reduce,
+                       hip_exec_with_reduce<BLOCK_SIZE, Async>,
+                       hip_exec<BLOCK_SIZE, Async>>;
 
 template <bool with_reduce, size_t BLOCK_SIZE>
-using hip_exec_base_async = std::conditional_t<with_reduce,
-    hip_exec_with_reduce_async<BLOCK_SIZE>,
-    hip_exec_async<BLOCK_SIZE>>;
+using hip_exec_base_async =
+    std::conditional_t<with_reduce,
+                       hip_exec_with_reduce_async<BLOCK_SIZE>,
+                       hip_exec_async<BLOCK_SIZE>>;
 
 // policies usable with WorkGroup
 using policy::hip::hip_work;
@@ -1319,10 +1397,10 @@ using policy::hip::hip_atomic_explicit;
 
 
 // policies usable with reducers
-template < hip::reduce_algorithm algorithm,
-           hip::block_communication_mode comm_mode,
-           size_t replication = named_usage::unspecified,
-           size_t atomic_stride = named_usage::unspecified >
+template <hip::reduce_algorithm algorithm,
+          hip::block_communication_mode comm_mode,
+          size_t replication   = named_usage::unspecified,
+          size_t atomic_stride = named_usage::unspecified>
 using hip_reduce_tuning = policy::hip::hip_reduce_policy<
     hip::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
 
@@ -1345,35 +1423,41 @@ using hip_reduce_tuning = policy::hip::hip_reduce_policy<
 //                 a cache shared by the whole device to avoid having to use
 //                 device scope fences. This improves performance on some HW but
 //                 is more difficult to code correctly.
-using hip_reduce_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::combine_last_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::combine_last_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::combine_last_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::combine_last_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_device_combine_atomic_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_device_init_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_device_combine_atomic_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_device_combine_atomic_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_device_init_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_device_combine_atomic_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_host_combine_atomic_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_host_init_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_host_combine_atomic_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_host_combine_atomic_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_host_init_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_host_combine_atomic_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 
 // Policy for RAJA::Reduce* objects that gives the same answer every time when
 // used in the same way
@@ -1385,25 +1469,26 @@ using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence;
 
 // Policy for RAJA::Reduce* objects that lets you select the default atomic or
 // non-atomic policy with a bool
-template < bool with_atomic >
-using hip_reduce_base = std::conditional_t<with_atomic, hip_reduce_atomic, hip_reduce>;
+template <bool with_atomic>
+using hip_reduce_base =
+    std::conditional_t<with_atomic, hip_reduce_atomic, hip_reduce>;
 
 
 // policies usable with multi_reducers
-template < hip::multi_reduce_algorithm algorithm,
-           typename SharedAtomicReplicationConcretizer,
-           typename SharedAtomicReplicationIndexer,
-           typename GlobalAtomicReplicationConcretizer,
-           typename GlobalAtomicReplicationIndexer >
-using hip_multi_reduce_tuning = policy::hip::hip_multi_reduce_policy<
-    hip::MultiReduceTuning<
-      algorithm,
-      hip::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
-                                    SharedAtomicReplicationIndexer,
-                                    GetOffsetRight<int>>,
-      hip::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
-                                    GlobalAtomicReplicationIndexer,
-                                    GetOffsetLeft<int>>>>;
+template <hip::multi_reduce_algorithm algorithm,
+          typename SharedAtomicReplicationConcretizer,
+          typename SharedAtomicReplicationIndexer,
+          typename GlobalAtomicReplicationConcretizer,
+          typename GlobalAtomicReplicationIndexer>
+using hip_multi_reduce_tuning =
+    policy::hip::hip_multi_reduce_policy<hip::MultiReduceTuning<
+        algorithm,
+        hip::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
+                                     SharedAtomicReplicationIndexer,
+                                     GetOffsetRight<int>>,
+        hip::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
+                                     GlobalAtomicReplicationIndexer,
+                                     GetOffsetLeft<int>>>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - *atomic* policies may use atomics to combine partial results. The
@@ -1416,44 +1501,51 @@ using hip_multi_reduce_tuning = policy::hip::hip_multi_reduce_policy<
 // - *host_init* policies initialize memory used with atomics on the host.
 //   This is faster overall than other policies on HW with direct host access
 //   to device memory such as the AMD MI300A El Capitan/Tuolumne systems.
-using hip_multi_reduce_atomic_block_then_atomic_grid_host_init = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    hip::SharedAtomicReplicationMaxPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<4>>,
-    hip::thread_xyz<>,
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<32>>,
-    hip::warp_global_xyz<>>;
+using hip_multi_reduce_atomic_block_then_atomic_grid_host_init =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        hip::SharedAtomicReplicationMaxPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<4>>,
+        hip::thread_xyz<>,
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<32>>,
+        hip::warp_global_xyz<>>;
 // special policy to test that multi-reducers work if there is not enough shmem
-using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    hip::SharedAtomicReplicationMaxPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<0>>,
-    hip::thread_xyz<>,
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<32>>,
-    hip::warp_global_xyz<>>;
+using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        hip::SharedAtomicReplicationMaxPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<0>>,
+        hip::thread_xyz<>,
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<32>>,
+        hip::warp_global_xyz<>>;
 //
 using hip_multi_reduce_atomic_global_host_init = hip_multi_reduce_tuning<
     hip::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
+    void,  // unused with this algorithm
+    void,  // unused with this algorithm
     hip::GlobalAtomicReplicationMinPow2Concretizer<
         hip::ConstantPreferredReplicationConcretizer<32>>,
     hip::warp_global_xyz<>>;
 //
-using hip_multi_reduce_atomic_global_no_replication_host_init = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<1>>,
-    hip::block_xyz<>>;
-
-// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the
-// same answer every time when used in the same way
-using hip_multi_reduce_atomic = hip_multi_reduce_atomic_block_then_atomic_grid_host_init;
-// Similar to above but optimized for low overhead in cases where it is rarely used
+using hip_multi_reduce_atomic_global_no_replication_host_init =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::init_host_combine_global_atomic,
+        void,  // unused with this algorithm
+        void,  // unused with this algorithm
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<1>>,
+        hip::block_xyz<>>;
+
+// Policy for RAJA::MultiReduce* objects that may use atomics and may not give
+// the same answer every time when used in the same way
+using hip_multi_reduce_atomic =
+    hip_multi_reduce_atomic_block_then_atomic_grid_host_init;
+// Similar to above but optimized for low overhead in cases where it is rarely
+// used
 using hip_multi_reduce_atomic_low_performance_low_overhead =
     hip_multi_reduce_atomic_global_no_replication_host_init;
 
@@ -1485,31 +1577,31 @@ using policy::hip::hip_launch_t;
 
 
 // policies usable with kernel and launch
-template < typename ... indexers >
-using hip_indexer_direct = policy::hip::hip_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template <typename... indexers>
+using hip_indexer_direct =
+    policy::hip::hip_indexer<iteration_mapping::Direct,
+                             kernel_sync_requirement::none,
+                             indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using hip_indexer_loop = policy::hip::hip_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
     indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using hip_indexer_syncable_loop = policy::hip::hip_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::sync,
     indexers...>;
 
-template < typename ... indexers >
-using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template <typename... indexers>
+using hip_flatten_indexer_direct =
+    policy::hip::hip_flatten_indexer<iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     indexers...>;
 
-template < typename ... indexers >
+template <typename... indexers>
 using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
@@ -1522,7 +1614,7 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer<
  * For example, a segment of size 2000 will not fit, and trigger a runtime
  * error.
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_thread_direct = hip_indexer_direct<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1537,22 +1629,28 @@ using hip_thread_yz_direct = hip_thread_direct<named_dim::y, named_dim::z>;
 using hip_thread_zx_direct = hip_thread_direct<named_dim::z, named_dim::x>;
 using hip_thread_zy_direct = hip_thread_direct<named_dim::z, named_dim::y>;
 
-using hip_thread_xyz_direct = hip_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_thread_xzy_direct = hip_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_thread_yxz_direct = hip_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_thread_yzx_direct = hip_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_thread_zxy_direct = hip_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_thread_zyx_direct = hip_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_thread_xyz_direct =
+    hip_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_thread_xzy_direct =
+    hip_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_thread_yxz_direct =
+    hip_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_thread_yzx_direct =
+    hip_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_thread_zxy_direct =
+    hip_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_thread_zyx_direct =
+    hip_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to HIP threads.
  * Uses block-stride looping to exceed the maximum number of physical threads
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_thread_loop = hip_indexer_loop<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_thread_syncable_loop = hip_indexer_syncable_loop<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1567,12 +1665,18 @@ using hip_thread_yz_loop = hip_thread_loop<named_dim::y, named_dim::z>;
 using hip_thread_zx_loop = hip_thread_loop<named_dim::z, named_dim::x>;
 using hip_thread_zy_loop = hip_thread_loop<named_dim::z, named_dim::y>;
 
-using hip_thread_xyz_loop = hip_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_thread_xzy_loop = hip_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_thread_yxz_loop = hip_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_thread_yzx_loop = hip_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_thread_zxy_loop = hip_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_thread_zyx_loop = hip_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_thread_xyz_loop =
+    hip_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_thread_xzy_loop =
+    hip_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_thread_yxz_loop =
+    hip_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_thread_yzx_loop =
+    hip_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_thread_zxy_loop =
+    hip_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_thread_zyx_loop =
+    hip_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP threads.
@@ -1580,7 +1684,7 @@ using hip_thread_zyx_loop = hip_thread_loop<named_dim::z, named_dim::y, named_di
  * physical threads to fit all of the direct map requests.
  * Reshapes multiple physical threads into a 1D iteration space
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_flatten_thread_direct = hip_flatten_indexer_direct<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1588,26 +1692,38 @@ using hip_flatten_thread_x_direct = hip_flatten_thread_direct<named_dim::x>;
 using hip_flatten_thread_y_direct = hip_flatten_thread_direct<named_dim::y>;
 using hip_flatten_thread_z_direct = hip_flatten_thread_direct<named_dim::z>;
 
-using hip_flatten_thread_xy_direct = hip_flatten_thread_direct<named_dim::x, named_dim::y>;
-using hip_flatten_thread_xz_direct = hip_flatten_thread_direct<named_dim::x, named_dim::z>;
-using hip_flatten_thread_yx_direct = hip_flatten_thread_direct<named_dim::y, named_dim::x>;
-using hip_flatten_thread_yz_direct = hip_flatten_thread_direct<named_dim::y, named_dim::z>;
-using hip_flatten_thread_zx_direct = hip_flatten_thread_direct<named_dim::z, named_dim::x>;
-using hip_flatten_thread_zy_direct = hip_flatten_thread_direct<named_dim::z, named_dim::y>;
-
-using hip_flatten_thread_xyz_direct = hip_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_thread_xzy_direct = hip_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_thread_yxz_direct = hip_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_thread_yzx_direct = hip_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_thread_zxy_direct = hip_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_thread_zyx_direct = hip_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_thread_xy_direct =
+    hip_flatten_thread_direct<named_dim::x, named_dim::y>;
+using hip_flatten_thread_xz_direct =
+    hip_flatten_thread_direct<named_dim::x, named_dim::z>;
+using hip_flatten_thread_yx_direct =
+    hip_flatten_thread_direct<named_dim::y, named_dim::x>;
+using hip_flatten_thread_yz_direct =
+    hip_flatten_thread_direct<named_dim::y, named_dim::z>;
+using hip_flatten_thread_zx_direct =
+    hip_flatten_thread_direct<named_dim::z, named_dim::x>;
+using hip_flatten_thread_zy_direct =
+    hip_flatten_thread_direct<named_dim::z, named_dim::y>;
+
+using hip_flatten_thread_xyz_direct =
+    hip_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_thread_xzy_direct =
+    hip_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_thread_yxz_direct =
+    hip_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_thread_yzx_direct =
+    hip_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_thread_zxy_direct =
+    hip_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_thread_zyx_direct =
+    hip_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP threads.
  * Reshapes multiple physical threads into a 1D iteration space
  * Uses block-stride looping to exceed the maximum number of physical threads
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_flatten_thread_loop = hip_flatten_indexer_loop<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1615,19 +1731,31 @@ using hip_flatten_thread_x_loop = hip_flatten_thread_loop<named_dim::x>;
 using hip_flatten_thread_y_loop = hip_flatten_thread_loop<named_dim::y>;
 using hip_flatten_thread_z_loop = hip_flatten_thread_loop<named_dim::z>;
 
-using hip_flatten_thread_xy_loop = hip_flatten_thread_loop<named_dim::x, named_dim::y>;
-using hip_flatten_thread_xz_loop = hip_flatten_thread_loop<named_dim::x, named_dim::z>;
-using hip_flatten_thread_yx_loop = hip_flatten_thread_loop<named_dim::y, named_dim::x>;
-using hip_flatten_thread_yz_loop = hip_flatten_thread_loop<named_dim::y, named_dim::z>;
-using hip_flatten_thread_zx_loop = hip_flatten_thread_loop<named_dim::z, named_dim::x>;
-using hip_flatten_thread_zy_loop = hip_flatten_thread_loop<named_dim::z, named_dim::y>;
-
-using hip_flatten_thread_xyz_loop = hip_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_thread_xzy_loop = hip_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_thread_yxz_loop = hip_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_thread_yzx_loop = hip_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_thread_zxy_loop = hip_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_thread_zyx_loop = hip_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_thread_xy_loop =
+    hip_flatten_thread_loop<named_dim::x, named_dim::y>;
+using hip_flatten_thread_xz_loop =
+    hip_flatten_thread_loop<named_dim::x, named_dim::z>;
+using hip_flatten_thread_yx_loop =
+    hip_flatten_thread_loop<named_dim::y, named_dim::x>;
+using hip_flatten_thread_yz_loop =
+    hip_flatten_thread_loop<named_dim::y, named_dim::z>;
+using hip_flatten_thread_zx_loop =
+    hip_flatten_thread_loop<named_dim::z, named_dim::x>;
+using hip_flatten_thread_zy_loop =
+    hip_flatten_thread_loop<named_dim::z, named_dim::y>;
+
+using hip_flatten_thread_xyz_loop =
+    hip_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_thread_xzy_loop =
+    hip_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_thread_yxz_loop =
+    hip_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_thread_yzx_loop =
+    hip_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_thread_zxy_loop =
+    hip_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_thread_zyx_loop =
+    hip_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1635,7 +1763,7 @@ using hip_flatten_thread_zyx_loop = hip_flatten_thread_loop<named_dim::z, named_
  * This is the lowest overhead mapping, but requires that there are enough
  * physical blocks to fit all of the direct map requests.
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_block_direct = hip_indexer_direct<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1650,22 +1778,28 @@ using hip_block_yz_direct = hip_block_direct<named_dim::y, named_dim::z>;
 using hip_block_zx_direct = hip_block_direct<named_dim::z, named_dim::x>;
 using hip_block_zy_direct = hip_block_direct<named_dim::z, named_dim::y>;
 
-using hip_block_xyz_direct = hip_block_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_block_xzy_direct = hip_block_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_block_yxz_direct = hip_block_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_block_yzx_direct = hip_block_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_block_zxy_direct = hip_block_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_block_zyx_direct = hip_block_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_block_xyz_direct =
+    hip_block_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_block_xzy_direct =
+    hip_block_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_block_yxz_direct =
+    hip_block_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_block_yzx_direct =
+    hip_block_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_block_zxy_direct =
+    hip_block_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_block_zyx_direct =
+    hip_block_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to HIP blocks.
  * Uses grid-stride looping to exceed the maximum number of blocks
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_block_loop = hip_indexer_loop<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_block_syncable_loop = hip_indexer_syncable_loop<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1680,12 +1814,18 @@ using hip_block_yz_loop = hip_block_loop<named_dim::y, named_dim::z>;
 using hip_block_zx_loop = hip_block_loop<named_dim::z, named_dim::x>;
 using hip_block_zy_loop = hip_block_loop<named_dim::z, named_dim::y>;
 
-using hip_block_xyz_loop = hip_block_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_block_xzy_loop = hip_block_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_block_yxz_loop = hip_block_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_block_yzx_loop = hip_block_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_block_zxy_loop = hip_block_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_block_zyx_loop = hip_block_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_block_xyz_loop =
+    hip_block_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_block_xzy_loop =
+    hip_block_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_block_yxz_loop =
+    hip_block_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_block_yzx_loop =
+    hip_block_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_block_zxy_loop =
+    hip_block_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_block_zyx_loop =
+    hip_block_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP blocks.
@@ -1693,7 +1833,7 @@ using hip_block_zyx_loop = hip_block_loop<named_dim::z, named_dim::y, named_dim:
  * physical blocks to fit all of the direct map requests.
  * Reshapes multiple physical blocks into a 1D iteration space
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_flatten_block_direct = hip_flatten_indexer_direct<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1701,26 +1841,38 @@ using hip_flatten_block_x_direct = hip_flatten_block_direct<named_dim::x>;
 using hip_flatten_block_y_direct = hip_flatten_block_direct<named_dim::y>;
 using hip_flatten_block_z_direct = hip_flatten_block_direct<named_dim::z>;
 
-using hip_flatten_block_xy_direct = hip_flatten_block_direct<named_dim::x, named_dim::y>;
-using hip_flatten_block_xz_direct = hip_flatten_block_direct<named_dim::x, named_dim::z>;
-using hip_flatten_block_yx_direct = hip_flatten_block_direct<named_dim::y, named_dim::x>;
-using hip_flatten_block_yz_direct = hip_flatten_block_direct<named_dim::y, named_dim::z>;
-using hip_flatten_block_zx_direct = hip_flatten_block_direct<named_dim::z, named_dim::x>;
-using hip_flatten_block_zy_direct = hip_flatten_block_direct<named_dim::z, named_dim::y>;
-
-using hip_flatten_block_xyz_direct = hip_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_block_xzy_direct = hip_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_block_yxz_direct = hip_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_block_yzx_direct = hip_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_block_zxy_direct = hip_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_block_zyx_direct = hip_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_block_xy_direct =
+    hip_flatten_block_direct<named_dim::x, named_dim::y>;
+using hip_flatten_block_xz_direct =
+    hip_flatten_block_direct<named_dim::x, named_dim::z>;
+using hip_flatten_block_yx_direct =
+    hip_flatten_block_direct<named_dim::y, named_dim::x>;
+using hip_flatten_block_yz_direct =
+    hip_flatten_block_direct<named_dim::y, named_dim::z>;
+using hip_flatten_block_zx_direct =
+    hip_flatten_block_direct<named_dim::z, named_dim::x>;
+using hip_flatten_block_zy_direct =
+    hip_flatten_block_direct<named_dim::z, named_dim::y>;
+
+using hip_flatten_block_xyz_direct =
+    hip_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_block_xzy_direct =
+    hip_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_block_yxz_direct =
+    hip_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_block_yzx_direct =
+    hip_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_block_zxy_direct =
+    hip_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_block_zyx_direct =
+    hip_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP blocks.
  * Reshapes multiple physical blocks into a 1D iteration space
  * Uses block-stride looping to exceed the maximum number of physical blocks
  */
-template < named_dim ... dims >
+template <named_dim... dims>
 using hip_flatten_block_loop = hip_flatten_indexer_loop<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1728,19 +1880,31 @@ using hip_flatten_block_x_loop = hip_flatten_block_loop<named_dim::x>;
 using hip_flatten_block_y_loop = hip_flatten_block_loop<named_dim::y>;
 using hip_flatten_block_z_loop = hip_flatten_block_loop<named_dim::z>;
 
-using hip_flatten_block_xy_loop = hip_flatten_block_loop<named_dim::x, named_dim::y>;
-using hip_flatten_block_xz_loop = hip_flatten_block_loop<named_dim::x, named_dim::z>;
-using hip_flatten_block_yx_loop = hip_flatten_block_loop<named_dim::y, named_dim::x>;
-using hip_flatten_block_yz_loop = hip_flatten_block_loop<named_dim::y, named_dim::z>;
-using hip_flatten_block_zx_loop = hip_flatten_block_loop<named_dim::z, named_dim::x>;
-using hip_flatten_block_zy_loop = hip_flatten_block_loop<named_dim::z, named_dim::y>;
-
-using hip_flatten_block_xyz_loop = hip_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_block_xzy_loop = hip_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_block_yxz_loop = hip_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_block_yzx_loop = hip_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_block_zxy_loop = hip_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_block_zyx_loop = hip_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_block_xy_loop =
+    hip_flatten_block_loop<named_dim::x, named_dim::y>;
+using hip_flatten_block_xz_loop =
+    hip_flatten_block_loop<named_dim::x, named_dim::z>;
+using hip_flatten_block_yx_loop =
+    hip_flatten_block_loop<named_dim::y, named_dim::x>;
+using hip_flatten_block_yz_loop =
+    hip_flatten_block_loop<named_dim::y, named_dim::z>;
+using hip_flatten_block_zx_loop =
+    hip_flatten_block_loop<named_dim::z, named_dim::x>;
+using hip_flatten_block_zy_loop =
+    hip_flatten_block_loop<named_dim::z, named_dim::y>;
+
+using hip_flatten_block_xyz_loop =
+    hip_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_block_xzy_loop =
+    hip_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_block_yxz_loop =
+    hip_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_block_yzx_loop =
+    hip_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_block_zxy_loop =
+    hip_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_block_zyx_loop =
+    hip_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1748,9 +1912,11 @@ using hip_flatten_block_zyx_loop = hip_flatten_block_loop<named_dim::z, named_di
  * This is the lowest overhead mapping, but requires that there are enough
  * physical threads to fit all of the direct map requests.
  */
-template < named_dim ... dims >
-using hip_global_direct = hip_indexer_direct<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template <named_dim... dims>
+using hip_global_direct =
+    hip_indexer_direct<hip::IndexGlobal<dims,
+                                        named_usage::unspecified,
+                                        named_usage::unspecified>...>;
 
 using hip_global_x_direct = hip_global_direct<named_dim::x>;
 using hip_global_y_direct = hip_global_direct<named_dim::y>;
@@ -1763,24 +1929,34 @@ using hip_global_yz_direct = hip_global_direct<named_dim::y, named_dim::z>;
 using hip_global_zx_direct = hip_global_direct<named_dim::z, named_dim::x>;
 using hip_global_zy_direct = hip_global_direct<named_dim::z, named_dim::y>;
 
-using hip_global_xyz_direct = hip_global_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_global_xzy_direct = hip_global_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_global_yxz_direct = hip_global_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_global_yzx_direct = hip_global_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_global_zxy_direct = hip_global_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_global_zyx_direct = hip_global_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_global_xyz_direct =
+    hip_global_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_global_xzy_direct =
+    hip_global_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_global_yxz_direct =
+    hip_global_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_global_yzx_direct =
+    hip_global_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_global_zxy_direct =
+    hip_global_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_global_zyx_direct =
+    hip_global_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to HIP global threads.
  * Uses grid-stride looping to exceed the maximum number of global threads
  */
-template < named_dim ... dims >
-using hip_global_loop = hip_indexer_loop<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
-
-template < named_dim ... dims >
-using hip_global_syncable_loop = hip_indexer_syncable_loop<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template <named_dim... dims>
+using hip_global_loop =
+    hip_indexer_loop<hip::IndexGlobal<dims,
+                                      named_usage::unspecified,
+                                      named_usage::unspecified>...>;
+
+template <named_dim... dims>
+using hip_global_syncable_loop =
+    hip_indexer_syncable_loop<hip::IndexGlobal<dims,
+                                               named_usage::unspecified,
+                                               named_usage::unspecified>...>;
 
 using hip_global_x_loop = hip_global_loop<named_dim::x>;
 using hip_global_y_loop = hip_global_loop<named_dim::y>;
@@ -1793,12 +1969,18 @@ using hip_global_yz_loop = hip_global_loop<named_dim::y, named_dim::z>;
 using hip_global_zx_loop = hip_global_loop<named_dim::z, named_dim::x>;
 using hip_global_zy_loop = hip_global_loop<named_dim::z, named_dim::y>;
 
-using hip_global_xyz_loop = hip_global_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_global_xzy_loop = hip_global_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_global_yxz_loop = hip_global_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_global_yzx_loop = hip_global_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_global_zxy_loop = hip_global_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_global_zyx_loop = hip_global_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_global_xyz_loop =
+    hip_global_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_global_xzy_loop =
+    hip_global_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_global_yxz_loop =
+    hip_global_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_global_yzx_loop =
+    hip_global_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_global_zxy_loop =
+    hip_global_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_global_zyx_loop =
+    hip_global_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP global threads.
@@ -1806,54 +1988,83 @@ using hip_global_zyx_loop = hip_global_loop<named_dim::z, named_dim::y, named_di
  * physical global threads to fit all of the direct map requests.
  * Reshapes multiple physical global threads into a 1D iteration space
  */
-template < named_dim ... dims >
-using hip_flatten_global_direct = hip_flatten_indexer_direct<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template <named_dim... dims>
+using hip_flatten_global_direct =
+    hip_flatten_indexer_direct<hip::IndexGlobal<dims,
+                                                named_usage::unspecified,
+                                                named_usage::unspecified>...>;
 
 using hip_flatten_global_x_direct = hip_flatten_global_direct<named_dim::x>;
 using hip_flatten_global_y_direct = hip_flatten_global_direct<named_dim::y>;
 using hip_flatten_global_z_direct = hip_flatten_global_direct<named_dim::z>;
 
-using hip_flatten_global_xy_direct = hip_flatten_global_direct<named_dim::x, named_dim::y>;
-using hip_flatten_global_xz_direct = hip_flatten_global_direct<named_dim::x, named_dim::z>;
-using hip_flatten_global_yx_direct = hip_flatten_global_direct<named_dim::y, named_dim::x>;
-using hip_flatten_global_yz_direct = hip_flatten_global_direct<named_dim::y, named_dim::z>;
-using hip_flatten_global_zx_direct = hip_flatten_global_direct<named_dim::z, named_dim::x>;
-using hip_flatten_global_zy_direct = hip_flatten_global_direct<named_dim::z, named_dim::y>;
-
-using hip_flatten_global_xyz_direct = hip_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_global_xzy_direct = hip_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_global_yxz_direct = hip_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_global_yzx_direct = hip_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_global_zxy_direct = hip_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_global_zyx_direct = hip_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_global_xy_direct =
+    hip_flatten_global_direct<named_dim::x, named_dim::y>;
+using hip_flatten_global_xz_direct =
+    hip_flatten_global_direct<named_dim::x, named_dim::z>;
+using hip_flatten_global_yx_direct =
+    hip_flatten_global_direct<named_dim::y, named_dim::x>;
+using hip_flatten_global_yz_direct =
+    hip_flatten_global_direct<named_dim::y, named_dim::z>;
+using hip_flatten_global_zx_direct =
+    hip_flatten_global_direct<named_dim::z, named_dim::x>;
+using hip_flatten_global_zy_direct =
+    hip_flatten_global_direct<named_dim::z, named_dim::y>;
+
+using hip_flatten_global_xyz_direct =
+    hip_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_global_xzy_direct =
+    hip_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_global_yxz_direct =
+    hip_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_global_yzx_direct =
+    hip_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_global_zxy_direct =
+    hip_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_global_zyx_direct =
+    hip_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP global threads.
  * Reshapes multiple physical global threads into a 1D iteration space
- * Uses global thread-stride looping to exceed the maximum number of physical global threads
+ * Uses global thread-stride looping to exceed the maximum number of physical
+ * global threads
  */
-template < named_dim ... dims >
-using hip_flatten_global_loop = hip_flatten_indexer_loop<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template <named_dim... dims>
+using hip_flatten_global_loop =
+    hip_flatten_indexer_loop<hip::IndexGlobal<dims,
+                                              named_usage::unspecified,
+                                              named_usage::unspecified>...>;
 
 using hip_flatten_global_x_loop = hip_flatten_global_loop<named_dim::x>;
 using hip_flatten_global_y_loop = hip_flatten_global_loop<named_dim::y>;
 using hip_flatten_global_z_loop = hip_flatten_global_loop<named_dim::z>;
 
-using hip_flatten_global_xy_loop = hip_flatten_global_loop<named_dim::x, named_dim::y>;
-using hip_flatten_global_xz_loop = hip_flatten_global_loop<named_dim::x, named_dim::z>;
-using hip_flatten_global_yx_loop = hip_flatten_global_loop<named_dim::y, named_dim::x>;
-using hip_flatten_global_yz_loop = hip_flatten_global_loop<named_dim::y, named_dim::z>;
-using hip_flatten_global_zx_loop = hip_flatten_global_loop<named_dim::z, named_dim::x>;
-using hip_flatten_global_zy_loop = hip_flatten_global_loop<named_dim::z, named_dim::y>;
-
-using hip_flatten_global_xyz_loop = hip_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_global_xzy_loop = hip_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_global_yxz_loop = hip_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_global_yzx_loop = hip_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_global_zxy_loop = hip_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_global_zyx_loop = hip_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_global_xy_loop =
+    hip_flatten_global_loop<named_dim::x, named_dim::y>;
+using hip_flatten_global_xz_loop =
+    hip_flatten_global_loop<named_dim::x, named_dim::z>;
+using hip_flatten_global_yx_loop =
+    hip_flatten_global_loop<named_dim::y, named_dim::x>;
+using hip_flatten_global_yz_loop =
+    hip_flatten_global_loop<named_dim::y, named_dim::z>;
+using hip_flatten_global_zx_loop =
+    hip_flatten_global_loop<named_dim::z, named_dim::x>;
+using hip_flatten_global_zy_loop =
+    hip_flatten_global_loop<named_dim::z, named_dim::y>;
+
+using hip_flatten_global_xyz_loop =
+    hip_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_global_xzy_loop =
+    hip_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_global_yxz_loop =
+    hip_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_global_yzx_loop =
+    hip_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_global_zxy_loop =
+    hip_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_global_zyx_loop =
+    hip_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1861,271 +2072,460 @@ using hip_flatten_global_zyx_loop = hip_flatten_global_loop<named_dim::z, named_
  * This is the lowest overhead mapping, but requires that there are enough
  * physical threads to fit all of the direct map requests.
  */
-template < int X_BLOCK_SIZE >
-using hip_thread_size_x_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using hip_thread_size_y_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using hip_thread_size_z_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_xy_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_xz_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_yx_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_yz_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_zx_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_zy_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_xyz_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_xzy_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_yxz_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_yzx_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_zxy_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_zyx_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
+template <int X_BLOCK_SIZE>
+using hip_thread_size_x_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE>
+using hip_thread_size_y_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE>
+using hip_thread_size_z_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_xy_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_xz_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_yx_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_yz_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_zx_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_zy_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_xyz_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_xzy_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_yxz_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_yzx_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_zxy_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_zyx_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>>;
+
+
+template <int X_GRID_SIZE>
 using hip_block_size_x_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
+template <int Y_GRID_SIZE>
 using hip_block_size_y_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
+template <int Z_GRID_SIZE>
 using hip_block_size_z_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>>;
 
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_xy_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_xz_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_yx_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_yz_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_zx_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_zy_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_xyz_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_xzy_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_yxz_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_yzx_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_zxy_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_zyx_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_x_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_y_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_z_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xy_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xz_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yx_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yz_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zx_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zy_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xyz_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xzy_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yxz_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yzx_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zxy_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zyx_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_xy_direct =
+    hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_xz_direct =
+    hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_yx_direct =
+    hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_yz_direct =
+    hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_zx_direct =
+    hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_zy_direct =
+    hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_xyz_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                                                     hip::block_y<Y_GRID_SIZE>,
+                                                     hip::block_z<Z_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_xzy_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                                                     hip::block_z<Z_GRID_SIZE>,
+                                                     hip::block_y<Y_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_yxz_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                                                     hip::block_x<X_GRID_SIZE>,
+                                                     hip::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_yzx_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                                                     hip::block_z<Z_GRID_SIZE>,
+                                                     hip::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_zxy_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                                                     hip::block_x<X_GRID_SIZE>,
+                                                     hip::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_zyx_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                                                     hip::block_y<Y_GRID_SIZE>,
+                                                     hip::block_x<X_GRID_SIZE>>;
+
+
+template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_x_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_y_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_z_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xy_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xz_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yx_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yz_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zx_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zy_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xyz_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xzy_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yxz_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yzx_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zxy_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zyx_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*!
  * Maps segment indices to HIP global threads.
  * Uses grid-stride looping to exceed the maximum number of global threads
  */
-template < int X_BLOCK_SIZE >
+template <int X_BLOCK_SIZE>
 using hip_thread_size_x_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
+template <int Y_BLOCK_SIZE>
 using hip_thread_size_y_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
+template <int Z_BLOCK_SIZE>
 using hip_thread_size_z_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>>;
 
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_xy_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_xz_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_yx_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_yz_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_zx_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_zy_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_xyz_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_xzy_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_yxz_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_yzx_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_zxy_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_zyx_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_xy_loop =
+    hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_xz_loop =
+    hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_yx_loop =
+    hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_yz_loop =
+    hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_zx_loop =
+    hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_zy_loop =
+    hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_xyz_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                                                  hip::thread_y<Y_BLOCK_SIZE>,
+                                                  hip::thread_z<Z_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_xzy_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                                                  hip::thread_z<Z_BLOCK_SIZE>,
+                                                  hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_yxz_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                                                  hip::thread_x<X_BLOCK_SIZE>,
+                                                  hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_yzx_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                                                  hip::thread_z<Z_BLOCK_SIZE>,
+                                                  hip::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_zxy_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                                                  hip::thread_x<X_BLOCK_SIZE>,
+                                                  hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_zyx_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                                                  hip::thread_y<Y_BLOCK_SIZE>,
+                                                  hip::thread_x<X_BLOCK_SIZE>>;
+
+
+template <int X_GRID_SIZE>
 using hip_block_size_x_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
+template <int Y_GRID_SIZE>
 using hip_block_size_y_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
+template <int Z_GRID_SIZE>
 using hip_block_size_z_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>>;
 
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_xy_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_xz_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_yx_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_yz_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_zx_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_zy_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_xyz_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_xzy_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_yxz_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_yzx_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_zxy_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_zyx_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_x_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_y_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_z_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xy_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xz_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yx_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yz_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zx_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zy_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xyz_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xzy_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yxz_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yzx_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zxy_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zyx_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_xy_loop =
+    hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_xz_loop =
+    hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_yx_loop =
+    hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_yz_loop =
+    hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_zx_loop =
+    hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_zy_loop =
+    hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_xyz_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                                                 hip::block_y<Y_GRID_SIZE>,
+                                                 hip::block_z<Z_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_xzy_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                                                 hip::block_z<Z_GRID_SIZE>,
+                                                 hip::block_y<Y_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_yxz_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                                                 hip::block_x<X_GRID_SIZE>,
+                                                 hip::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_yzx_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                                                 hip::block_z<Z_GRID_SIZE>,
+                                                 hip::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_zxy_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                                                 hip::block_x<X_GRID_SIZE>,
+                                                 hip::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_zyx_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                                                 hip::block_y<Y_GRID_SIZE>,
+                                                 hip::block_x<X_GRID_SIZE>>;
+
+
+template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_x_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_y_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_z_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xy_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xz_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yx_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yz_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zx_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zy_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xyz_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xzy_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yxz_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yzx_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zxy_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zyx_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*
  * Maps segment indices to flattened HIP global threads.
@@ -2133,272 +2533,507 @@ using hip_global_size_zyx_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_
  * physical global threads to fit all of the direct map requests.
  * Reshapes multiple physical global threads into a 1D iteration space
  */
-template < int X_BLOCK_SIZE >
-using hip_flatten_thread_size_x_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_y_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_z_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_xy_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_xz_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_yx_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_yz_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_zx_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_zy_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_xyz_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_xzy_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_yxz_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_yzx_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_zxy_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_zyx_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using hip_flatten_block_size_x_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using hip_flatten_block_size_y_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using hip_flatten_block_size_z_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_xy_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_xz_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_yx_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_yz_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_zx_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_zy_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_xyz_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_xzy_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_yxz_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_yzx_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_zxy_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_zyx_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_x_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_y_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_z_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xy_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xz_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yx_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yz_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zx_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zy_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xyz_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xzy_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yxz_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yzx_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zxy_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zyx_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int X_BLOCK_SIZE>
+using hip_flatten_thread_size_x_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_y_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_z_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_xy_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_xz_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_yx_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_yz_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_zx_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_zy_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_xyz_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_xzy_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_yxz_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_yzx_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_zxy_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_zyx_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>>;
+
+
+template <int X_GRID_SIZE>
+using hip_flatten_block_size_x_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE>
+using hip_flatten_block_size_y_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE>
+using hip_flatten_block_size_z_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_xy_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_xz_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_yx_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_yz_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_zx_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_zy_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_xyz_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_xzy_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_yxz_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_yzx_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_zxy_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_zyx_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>>;
+
+
+template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_x_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_y_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_z_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xy_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xz_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yx_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yz_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zx_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zy_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xyz_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xzy_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yxz_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yzx_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zxy_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zyx_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*
  * Maps segment indices to flattened HIP global threads.
  * Reshapes multiple physical global threads into a 1D iteration space
- * Uses global thread-stride looping to exceed the maximum number of physical global threads
+ * Uses global thread-stride looping to exceed the maximum number of physical
+ * global threads
  */
-template < int X_BLOCK_SIZE >
-using hip_flatten_thread_size_x_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_y_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_z_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_xy_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_xz_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_yx_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_yz_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_zx_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_zy_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_xyz_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_xzy_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_yxz_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_yzx_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_zxy_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_zyx_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using hip_flatten_block_size_x_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using hip_flatten_block_size_y_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using hip_flatten_block_size_z_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_xy_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_xz_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_yx_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_yz_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_zx_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_zy_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_xyz_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_xzy_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_yxz_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_yzx_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_zxy_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_zyx_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_x_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_y_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_z_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xy_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                 hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xz_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                 hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yx_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                 hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yz_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                 hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zx_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                 hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zy_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                 hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xyz_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xzy_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yxz_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yzx_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zxy_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zyx_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int X_BLOCK_SIZE>
+using hip_flatten_thread_size_x_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_y_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_z_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_xy_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_xz_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_yx_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_yz_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_zx_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_zy_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>>;
+
+template <int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_xyz_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>>;
+template <int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_xzy_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_yxz_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>>;
+template <int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_yzx_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_zxy_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>>;
+template <int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_zyx_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>>;
+
+
+template <int X_GRID_SIZE>
+using hip_flatten_block_size_x_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE>
+using hip_flatten_block_size_y_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE>
+using hip_flatten_block_size_z_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_xy_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_xz_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_yx_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_yz_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_zx_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_zy_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>>;
+
+template <int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_xyz_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>>;
+template <int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_xzy_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_yxz_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>>;
+template <int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_yzx_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_zxy_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>>;
+template <int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_zyx_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>>;
+
+
+template <int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_x_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_y_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_z_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xy_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xz_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yx_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yz_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zx_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zy_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template <int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xyz_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xzy_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yxz_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template <int Y_BLOCK_SIZE,
+          int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yzx_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zxy_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template <int Z_BLOCK_SIZE,
+          int Y_BLOCK_SIZE,
+          int X_BLOCK_SIZE,
+          int Z_GRID_SIZE = named_usage::unspecified,
+          int Y_GRID_SIZE = named_usage::unspecified,
+          int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zyx_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 
 /*
diff --git a/include/RAJA/policy/hip/raja_hiperrchk.hpp b/include/RAJA/policy/hip/raja_hiperrchk.hpp
index 5e3a02fb2c..71542f2410 100644
--- a/include/RAJA/policy/hip/raja_hiperrchk.hpp
+++ b/include/RAJA/policy/hip/raja_hiperrchk.hpp
@@ -42,18 +42,18 @@ namespace RAJA
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
-#define hipErrchk(ans)                            \
-  {                                                \
-    ::RAJA::hipAssert((ans), __FILE__, __LINE__); \
+#define hipErrchk(ans)                                                         \
+  {                                                                            \
+    ::RAJA::hipAssert((ans), __FILE__, __LINE__);                              \
   }
 
-inline void hipAssert(hipError_t code,
-                       const char *file,
-                       int line,
-                       bool abort = true)
+inline void
+hipAssert(hipError_t code, const char* file, int line, bool abort = true)
 {
-  if (code != hipSuccess) {
-    if (abort) {
+  if (code != hipSuccess)
+  {
+    if (abort)
+    {
       std::string msg;
       msg += "HIPassert: ";
       msg += hipGetErrorString(code);
@@ -62,9 +62,11 @@ inline void hipAssert(hipError_t code,
       msg += ":";
       msg += std::to_string(line);
       throw std::runtime_error(msg);
-    } else {
-      fprintf(stderr, "HIPassert: %s %s %d\n",
-              hipGetErrorString(code), file, line);
+    }
+    else
+    {
+      fprintf(stderr, "HIPassert: %s %s %d\n", hipGetErrorString(code), file,
+              line);
     }
   }
 }
diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp
index c81adf8e24..1f9bd87fae 100644
--- a/include/RAJA/policy/hip/reduce.hpp
+++ b/include/RAJA/policy/hip/reduce.hpp
@@ -60,47 +60,53 @@ template <typename Combiner>
 struct atomic;
 
 template <typename T>
-struct atomic<sum<T>> {
+struct atomic<sum<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicAdd(RAJA::hip_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct atomic<min<T>> {
+struct atomic<min<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicMin(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicMin(RAJA::hip_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct atomic<max<T>> {
+struct atomic<max<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicMax(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicMax(RAJA::hip_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct atomic<and_bit<T>> {
+struct atomic<and_bit<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicAnd(RAJA::hip_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct atomic<or_bit<T>> {
+struct atomic<or_bit<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicOr(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicOr(RAJA::hip_atomic {}, &val, v);
   }
 };
 
 template <typename T>
-struct hip_atomic_available {
+struct hip_atomic_available
+{
   static constexpr const bool value =
       (std::is_integral<T>::value && (4 == sizeof(T) || 8 == sizeof(T))) ||
       std::is_same<T, float>::value || std::is_same<T, double>::value;
@@ -118,15 +124,19 @@ namespace impl
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
-          typename T, typename TempIterator>
+template <typename Combiner,
+          typename Accessor,
+          int replication,
+          int atomic_stride,
+          typename T,
+          typename TempIterator>
 RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
-                                        T identity,
-                                        TempIterator in_device_mem,
-                                        unsigned int* device_count)
+                                                   T identity,
+                                                   TempIterator in_device_mem,
+                                                   unsigned int* device_count)
 {
-  typename TempIterator::template rebind_accessor<Accessor> device_mem(in_device_mem);
+  typename TempIterator::template rebind_accessor<Accessor> device_mem(
+      in_device_mem);
 
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -137,20 +147,22 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
   int replicationId = blockId % replication;
-  int slotId = blockId / replication;
+  int slotId        = blockId / replication;
 
-  int maxNumSlots = (numBlocks + replication - 1) / replication;
+  int maxNumSlots       = (numBlocks + replication - 1) / replication;
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
   int atomicOffset = replicationId * atomic_stride;
-  int beginSlots = replicationId * maxNumSlots;
-  int blockSlot = beginSlots + slotId;
+  int beginSlots   = replicationId * maxNumSlots;
+  int blockSlot    = beginSlots + slotId;
 
   T temp = block_reduce<Combiner>(val, identity);
 
-  if (numSlots <= 1u) {
-    if (threadId == 0) {
+  if (numSlots <= 1u)
+  {
+    if (threadId == 0)
+    {
       val = temp;
     }
     return (threadId == 0) ? replicationId : replication;
@@ -158,33 +170,36 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
 
   // one thread per block writes to device_mem
   __shared__ bool isLastBlock;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     device_mem.set(blockSlot, temp);
     // ensure write visible to all threadblocks
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots-1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1));
-    isLastBlock = (old_count == (numSlots-1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
+    isLastBlock = (old_count == (numSlots - 1));
   }
 
   // returns non-zero value if any thread passes in a non-zero value
   __syncthreads();
 
   // last block accumulates values from device_mem
-  if (isLastBlock) {
+  if (isLastBlock)
+  {
     temp = identity;
     Accessor::fence_acquire();
 
-    for (unsigned int i = threadId;
-                      i < numSlots;
-                      i += numThreads) {
-      Combiner{}(temp, device_mem.get(beginSlots + i));
+    for (unsigned int i = threadId; i < numSlots; i += numThreads)
+    {
+      Combiner {}(temp, device_mem.get(beginSlots + i));
     }
 
     temp = block_reduce<Combiner>(temp, identity);
 
     // one thread returns value
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       val = temp;
     }
   }
@@ -192,72 +207,91 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   return (isLastBlock && threadId == 0) ? replicationId : replication;
 }
 
-namespace expt {
+namespace expt
+{
 
 template <typename ThreadIterationGetter, typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 {
   const int numThreads = ThreadIterationGetter::size();
-  const int threadId = ThreadIterationGetter::index();
+  const int threadId   = ThreadIterationGetter::index();
 
-  const int warpId = threadId % RAJA::policy::hip::device_constants.WARP_SIZE;
+  const int warpId  = threadId % RAJA::policy::hip::device_constants.WARP_SIZE;
   const int warpNum = threadId / RAJA::policy::hip::device_constants.WARP_SIZE;
 
   T temp = val;
 
-  if (numThreads % RAJA::policy::hip::device_constants.WARP_SIZE == 0) {
+  if (numThreads % RAJA::policy::hip::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
-      temp = Combiner{}(temp, rhs);
+      temp  = Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = RAJA::hip::impl::shfl_sync(temp, srcLane);
+      T rhs       = RAJA::hip::impl::shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        temp = Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        temp = Combiner {}(temp, rhs);
       }
     }
   }
 
-  static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <= RAJA::policy::hip::device_constants.WARP_SIZE,
-               "Max Warps must be less than or equal to Warp Size for this algorithm to work");
+  static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
+                "Max Warps must be less than or equal to Warp Size for this "
+                "algorithm to work");
 
   // reduce per warp values
-  if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE) {
+  if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE)
+  {
 
     // Need to separate declaration and initialization for clang-hip
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T,
+                               RAJA::policy::hip::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
-    RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS> * sd = reinterpret_cast<RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS> *>(tmpsd);
+    RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS>*
+        sd = reinterpret_cast<RAJA::detail::SoAArray<
+            T, RAJA::policy::hip::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
-    if (warpId == 0) {
+    if (warpId == 0)
+    {
       sd->set(warpNum, temp);
     }
 
     __syncthreads();
 
-    if (warpNum == 0) {
+    if (warpNum == 0)
+    {
 
       // read per warp values
-      if (warpId * RAJA::policy::hip::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * RAJA::policy::hip::device_constants.WARP_SIZE < numThreads)
+      {
         temp = sd->get(warpId);
-      } else {
+      }
+      else
+      {
         temp = identity;
       }
 
-      for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2)
+      {
         T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
-        temp = Combiner{}(temp, rhs);
+        temp  = Combiner {}(temp, rhs);
       }
     }
 
@@ -269,67 +303,77 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 
 
 template <typename GlobalIterationGetter, typename OP, typename T>
-RAJA_DEVICE RAJA_INLINE void grid_reduce( T * device_target,
-                                          T val,
-                                          RAJA::detail::SoAPtr<T,RAJA::hip::device_mempool_type> device_mem,
-                                          unsigned int * device_count)
+RAJA_DEVICE RAJA_INLINE void
+grid_reduce(T* device_target,
+            T val,
+            RAJA::detail::SoAPtr<T, RAJA::hip::device_mempool_type> device_mem,
+            unsigned int* device_count)
 {
-  using BlockIterationGetter = typename get_index_block<GlobalIterationGetter>::type;
-  using ThreadIterationGetter = typename get_index_thread<GlobalIterationGetter>::type;
+  using BlockIterationGetter =
+      typename get_index_block<GlobalIterationGetter>::type;
+  using ThreadIterationGetter =
+      typename get_index_thread<GlobalIterationGetter>::type;
 
-  const int numBlocks = BlockIterationGetter::size();
-  const int numThreads = ThreadIterationGetter::size();
+  const int numBlocks            = BlockIterationGetter::size();
+  const int numThreads           = ThreadIterationGetter::size();
   const unsigned int wrap_around = numBlocks - 1;
 
-  const int blockId = BlockIterationGetter::index();
+  const int blockId  = BlockIterationGetter::index();
   const int threadId = ThreadIterationGetter::index();
 
   T temp = block_reduce<ThreadIterationGetter, OP>(val, OP::identity());
 
   // one thread per block writes to device_mem
   bool lastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     device_mem.set(blockId, temp);
     // ensure write visible to all threadblocks
     __threadfence();
     // increment counter, (wraps back to zero if old count == wrap_around)
     unsigned int old_count = ::atomicInc(device_count, wrap_around);
-    lastBlock = (old_count == wrap_around);
+    lastBlock              = (old_count == wrap_around);
   }
 
   // returns non-zero value if any thread passes in a non-zero value
   lastBlock = __syncthreads_or(lastBlock);
 
   // last block accumulates values from device_mem
-  if (lastBlock) {
+  if (lastBlock)
+  {
     temp = OP::identity();
     __threadfence();
 
-    for (int i = threadId; i < numBlocks; i += numThreads) {
-      temp = OP{}(temp, device_mem.get(i));
+    for (int i = threadId; i < numBlocks; i += numThreads)
+    {
+      temp = OP {}(temp, device_mem.get(i));
     }
 
     temp = block_reduce<ThreadIterationGetter, OP>(temp, OP::identity());
 
     // one thread returns value
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       *device_target = temp;
     }
   }
 }
 
-} //  namespace expt
+}  //  namespace expt
 
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
+template <typename Combiner,
+          typename Accessor,
+          int replication,
+          int atomic_stride,
           typename T>
-RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
-                                               T identity,
-                                               T* device_mem,
-                                               unsigned int* device_count)
+RAJA_DEVICE RAJA_INLINE int
+grid_reduce_atomic_device_init(T& val,
+                               T identity,
+                               T* device_mem,
+                               unsigned int* device_count)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -338,24 +382,28 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
                 (gridDim.x * gridDim.y) * blockIdx.z;
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset  = replicationId * atomic_stride;
 
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
-  if (numSlots <= 1u) {
+  if (numSlots <= 1u)
+  {
     T temp = block_reduce<Combiner>(val, identity);
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       val = temp;
     }
     return (threadId == 0) ? replicationId : replication;
   }
 
   // the first block of each replication initializes device_mem
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u);
-    if (old_val == 0u) {
+    if (old_val == 0u)
+    {
       Accessor::set(device_mem, atomicOffset, identity);
       Accessor::fence_release();
       ::atomicAdd(&device_count[atomicOffset], 1u);
@@ -366,19 +414,22 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 
   // one thread per block performs an atomic on device_mem
   bool isLastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     // wait for device_mem to be initialized
     while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u)
       ;
     Accessor::fence_acquire();
-    RAJA::reduce::hip::atomic<Combiner>{}(device_mem[atomicOffset], temp);
+    RAJA::reduce::hip::atomic<Combiner> {}(device_mem[atomicOffset], temp);
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots+1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1));
-    isLastBlock = (old_count == (numSlots+1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
+    isLastBlock = (old_count == (numSlots + 1));
 
     // the last block for each replication gets the value from device_mem
-    if (isLastBlock) {
+    if (isLastBlock)
+    {
       Accessor::fence_acquire();
       val = Accessor::get(device_mem, atomicOffset);
     }
@@ -389,9 +440,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 
 //! reduce values in block into thread 0 and atomically combines into device_mem
 template <typename Combiner, int replication, int atomic_stride, typename T>
-RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
-                                                            T identity,
-                                                            T* device_mem)
+RAJA_DEVICE RAJA_INLINE void
+grid_reduce_atomic_host_init(T& val, T identity, T* device_mem)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -399,16 +449,16 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                 (gridDim.x * gridDim.y) * blockIdx.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset  = replicationId * atomic_stride;
 
   T temp = block_reduce<Combiner>(val, identity);
 
   // one thread per block performs an atomic on device_mem
-  if (threadId == 0 && temp != identity) {
-    RAJA::reduce::hip::atomic<Combiner>{}(device_mem[atomicOffset], temp);
+  if (threadId == 0 && temp != identity)
+  {
+    RAJA::reduce::hip::atomic<Combiner> {}(device_mem[atomicOffset], temp);
   }
-
 }
 
 }  // namespace impl
@@ -420,12 +470,14 @@ class PinnedTally
 {
 public:
   //! Object put in Pinned memory with value and pointer to next Node
-  struct Node {
+  struct Node
+  {
     Node* next;
     T values[num_slots];
   };
   //! Object per resource to keep track of pinned memory nodes
-  struct ResourceNode {
+  struct ResourceNode
+  {
     ResourceNode* next;
     ::RAJA::resources::Hip res;
     Node* node_list;
@@ -478,14 +530,19 @@ class PinnedTally
 
     const ResourceNodeIterator& operator++()
     {
-      if (m_n->next) {
+      if (m_n->next)
+      {
         m_n = m_n->next;
-      } else if (m_rn->next) {
+      }
+      else if (m_rn->next)
+      {
         m_rn = m_rn->next;
-        m_n = m_rn->node_list;
-      } else {
+        m_n  = m_rn->node_list;
+      }
+      else
+      {
         m_rn = nullptr;
-        m_n = nullptr;
+        m_n  = nullptr;
       }
       return *this;
     }
@@ -497,7 +554,7 @@ class PinnedTally
       return ret;
     }
 
-    auto operator*() -> T(&)[num_slots] { return m_n->values; }
+    auto operator*() -> T (&)[num_slots] { return m_n->values; }
 
     bool operator==(const ResourceNodeIterator& rhs) const
     {
@@ -534,25 +591,27 @@ class PinnedTally
   ResourceNodeIterator end() { return {nullptr, nullptr}; }
 
   //! get new value for use in resource
-  auto new_value(::RAJA::resources::Hip res) -> T(&)[num_slots]
+  auto new_value(::RAJA::resources::Hip res) -> T (&)[num_slots]
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
     ResourceNode* rn = resource_list;
-    while (rn) {
+    while (rn)
+    {
       if (rn->res.get_stream() == res.get_stream()) break;
       rn = rn->next;
     }
-    if (!rn) {
-      rn = (ResourceNode*)malloc(sizeof(ResourceNode));
-      rn->next = resource_list;
-      rn->res = res;
+    if (!rn)
+    {
+      rn            = (ResourceNode*)malloc(sizeof(ResourceNode));
+      rn->next      = resource_list;
+      rn->res       = res;
       rn->node_list = nullptr;
       resource_list = rn;
     }
-    Node* n = mempool::getInstance().template malloc<Node>(1);
-    n->next = rn->node_list;
+    Node* n       = mempool::getInstance().template malloc<Node>(1);
+    n->next       = rn->node_list;
     rn->node_list = n;
     return n->values;
   }
@@ -561,7 +620,8 @@ class PinnedTally
   void synchronize_resources()
   {
     auto end = resourceEnd();
-    for (auto r = resourceBegin(); r != end; ++r) {
+    for (auto r = resourceBegin(); r != end; ++r)
+    {
       ::RAJA::hip::synchronize(*r);
     }
   }
@@ -569,10 +629,12 @@ class PinnedTally
   //! all values used in all resources
   void free_list()
   {
-    while (resource_list) {
+    while (resource_list)
+    {
       ResourceNode* rn = resource_list;
-      while (rn->node_list) {
-        Node* n = rn->node_list;
+      while (rn->node_list)
+      {
+        Node* n       = rn->node_list;
         rn->node_list = n->next;
         mempool::getInstance().free(n);
       }
@@ -601,12 +663,15 @@ class PinnedTally
 
 //! Reduction data for Hip Offload -- stores value, host pointer, and device
 //! pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
+template <typename Combiner,
+          typename Accessor,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
 struct ReduceLastBlock_Data
 {
   using tally_mempool_type = pinned_mempool_type;
-  using data_mempool_type = device_mempool_type;
+  using data_mempool_type  = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
 
   static constexpr size_t tally_slots = replication;
@@ -617,7 +682,7 @@ struct ReduceLastBlock_Data
   RAJA::detail::SoAPtr<T, data_mempool_type> device;
   bool own_device_ptr;
 
-  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){};
+  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {};
 
   /*! \brief create from a default value and offload information
    *
@@ -625,31 +690,30 @@ struct ReduceLastBlock_Data
    */
 
   ReduceLastBlock_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        device_count{nullptr},
-        device{},
-        own_device_ptr{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        device_count {nullptr},
+        device {},
+        own_device_ptr {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceLastBlock_Data(const ReduceLastBlock_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        device_count{other.device_count},
-        device{other.device},
-        own_device_ptr{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        device_count {other.device_count},
+        device {other.device},
+        own_device_ptr {false}
+  {}
 
   ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -660,10 +724,12 @@ struct ReduceLastBlock_Data
   void grid_reduce(T* output)
   {
     T temp = value;
-    size_t replicationId = impl::grid_reduce_last_block<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
-    if (replicationId != replication) {
+    size_t replicationId =
+        impl::grid_reduce_last_block<Combiner, Accessor, replication,
+                                     atomic_stride>(temp, identity, device,
+                                                    device_count);
+    if (replicationId != replication)
+    {
       output[replicationId] = temp;
     }
   }
@@ -673,13 +739,15 @@ struct ReduceLastBlock_Data
   bool setupForDevice()
   {
     bool act = !device.allocated() && setupReducers();
-    if (act) {
-      hip_dim_t gridDim = currentGridDim();
-      size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
+    if (act)
+    {
+      hip_dim_t gridDim  = currentGridDim();
+      size_t numBlocks   = gridDim.x * gridDim.y * gridDim.z;
       size_t maxNumSlots = (numBlocks + replication - 1) / replication;
-      device.allocate(maxNumSlots*replication);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+      device.allocate(maxNumSlots * replication);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       own_device_ptr = true;
     }
     return act;
@@ -690,10 +758,11 @@ struct ReduceLastBlock_Data
   bool teardownForDevice()
   {
     bool act = own_device_ptr;
-    if (act) {
+    if (act)
+    {
       device.deallocate();
       count_mempool_type::getInstance().free(device_count);
-      device_count = nullptr;
+      device_count   = nullptr;
       own_device_ptr = false;
     }
     return act;
@@ -702,8 +771,10 @@ struct ReduceLastBlock_Data
 
 
 //! Reduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename T,
-          size_t replication, size_t atomic_stride>
+template <typename Combiner,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
 struct ReduceAtomicHostInit_Data
 {
   using tally_mempool_type = device_pinned_mempool_type;
@@ -715,32 +786,32 @@ struct ReduceAtomicHostInit_Data
   bool is_setup;
   bool own_device_ptr;
 
-  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){}
+  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {}
 
   ReduceAtomicHostInit_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        is_setup{false},
-        own_device_ptr{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        is_setup {false},
+        own_device_ptr {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        is_setup{other.is_setup},
-        own_device_ptr{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        is_setup {other.is_setup},
+        own_device_ptr {false}
+  {}
 
-  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default;
+  ReduceAtomicHostInit_Data&
+  operator=(const ReduceAtomicHostInit_Data&) = default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -753,7 +824,7 @@ struct ReduceAtomicHostInit_Data
     T temp = value;
 
     impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
-            temp, identity, output);
+        temp, identity, output);
   }
 
   //! check and setup for device
@@ -761,8 +832,9 @@ struct ReduceAtomicHostInit_Data
   bool setupForDevice()
   {
     bool act = !is_setup && setupReducers();
-    if (act) {
-      is_setup = true;
+    if (act)
+    {
+      is_setup       = true;
       own_device_ptr = true;
     }
     return act;
@@ -773,8 +845,9 @@ struct ReduceAtomicHostInit_Data
   bool teardownForDevice()
   {
     bool act = own_device_ptr;
-    if (act) {
-      is_setup = false;
+    if (act)
+    {
+      is_setup       = false;
       own_device_ptr = false;
     }
     return act;
@@ -782,12 +855,15 @@ struct ReduceAtomicHostInit_Data
 };
 
 //! Reduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
+template <typename Combiner,
+          typename Accessor,
+          typename T,
+          size_t replication,
+          size_t atomic_stride>
 struct ReduceAtomicDeviceInit_Data
 {
   using tally_mempool_type = pinned_mempool_type;
-  using data_mempool_type = device_mempool_type;
+  using data_mempool_type  = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
 
   static constexpr size_t tally_slots = replication;
@@ -798,34 +874,34 @@ struct ReduceAtomicDeviceInit_Data
   T* device;
   bool own_device_ptr;
 
-  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){}
+  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {}
 
   ReduceAtomicDeviceInit_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        device_count{nullptr},
-        device{nullptr},
-        own_device_ptr{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        device_count {nullptr},
+        device {nullptr},
+        own_device_ptr {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        device_count{other.device_count},
-        device{other.device},
-        own_device_ptr{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        device_count {other.device_count},
+        device {other.device},
+        own_device_ptr {false}
+  {}
 
-  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default;
+  ReduceAtomicDeviceInit_Data&
+  operator=(const ReduceAtomicDeviceInit_Data&) = default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -837,10 +913,12 @@ struct ReduceAtomicDeviceInit_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_atomic_device_init<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
-    if (replicationId != replication) {
+    size_t replicationId =
+        impl::grid_reduce_atomic_device_init<Combiner, Accessor, replication,
+                                             atomic_stride>(
+            temp, identity, device, device_count);
+    if (replicationId != replication)
+    {
       output[replicationId] = temp;
     }
   }
@@ -850,10 +928,13 @@ struct ReduceAtomicDeviceInit_Data
   bool setupForDevice()
   {
     bool act = !device && setupReducers();
-    if (act) {
-      device = data_mempool_type::getInstance().template malloc<T>(replication*atomic_stride);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+    if (act)
+    {
+      device = data_mempool_type::getInstance().template malloc<T>(
+          replication * atomic_stride);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       own_device_ptr = true;
     }
     return act;
@@ -864,11 +945,12 @@ struct ReduceAtomicDeviceInit_Data
   bool teardownForDevice()
   {
     bool act = own_device_ptr;
-    if (act) {
+    if (act)
+    {
       data_mempool_type::getInstance().free(device);
       device = nullptr;
       count_mempool_type::getInstance().free(device_count);
-      device_count = nullptr;
+      device_count   = nullptr;
       own_device_ptr = false;
     }
     return act;
@@ -880,49 +962,77 @@ struct ReduceAtomicDeviceInit_Data
 template <typename Combiner, typename T, typename tuning>
 class Reduce
 {
-  static constexpr size_t replication = (tuning::replication > 0)
-      ? tuning::replication
-      : 32;
-  static constexpr size_t atomic_stride = (tuning::atomic_stride > 0)
-      ? tuning::atomic_stride
-      : ((policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
-        ? RAJA_DIVIDE_CEILING_INT(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T))
-        : 1);
-
-  using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence),
+  static constexpr size_t replication =
+      (tuning::replication > 0) ? tuning::replication : 32;
+  static constexpr size_t atomic_stride =
+      (tuning::atomic_stride > 0)
+          ? tuning::atomic_stride
+          : ((policy::hip::device_constants
+                  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
+                 ? RAJA_DIVIDE_CEILING_INT(
+                       policy::hip::device_constants
+                           .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
+                       sizeof(T))
+                 : 1);
+
+  using Accessor = std::conditional_t<
+      (tuning::comm_mode == block_communication_mode::block_fence),
       impl::AccessorDeviceScopeUseBlockFence,
-      std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence),
-        impl::AccessorDeviceScopeUseDeviceFence,
-        void>>;
+      std::conditional_t<(tuning::comm_mode ==
+                          block_communication_mode::device_fence),
+                         impl::AccessorDeviceScopeUseDeviceFence,
+                         void>>;
 
   static constexpr bool atomic_policy =
-      (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) ||
+      (tuning::algorithm ==
+       reduce_algorithm::init_device_combine_atomic_block) ||
       (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
-  static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::hip::hip_atomic_available<T>::value;
 
   //! hip reduction data storage class and folding algorithm
-  using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) ||
-                                              (atomic_policy && !atomic_available),
-      hip::ReduceLastBlock_Data<Combiner, Accessor, T, replication, atomic_stride>,
-      std::conditional_t<atomic_available,
-        std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block),
-          hip::ReduceAtomicDeviceInit_Data<Combiner, Accessor, T, replication, atomic_stride>,
-          std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block),
-            hip::ReduceAtomicHostInit_Data<Combiner, T, replication, atomic_stride>,
-            void>>,
-        void>>;
+  using reduce_data_type = std::conditional_t<
+      (tuning::algorithm == reduce_algorithm::combine_last_block) ||
+          (atomic_policy && !atomic_available),
+      hip::ReduceLastBlock_Data<Combiner,
+                                Accessor,
+                                T,
+                                replication,
+                                atomic_stride>,
+      std::conditional_t<
+          atomic_available,
+          std::conditional_t<
+              (tuning::algorithm ==
+               reduce_algorithm::init_device_combine_atomic_block),
+              hip::ReduceAtomicDeviceInit_Data<Combiner,
+                                               Accessor,
+                                               T,
+                                               replication,
+                                               atomic_stride>,
+              std::conditional_t<
+                  (tuning::algorithm ==
+                   reduce_algorithm::init_host_combine_atomic_block),
+                  hip::ReduceAtomicHostInit_Data<Combiner,
+                                                 T,
+                                                 replication,
+                                                 atomic_stride>,
+                  void>>,
+          void>>;
 
   static constexpr size_t tally_slots = reduce_data_type::tally_slots;
 
-  using TallyType = PinnedTally<T, tally_slots, typename reduce_data_type::tally_mempool_type>;
+  using TallyType = PinnedTally<T,
+                                tally_slots,
+                                typename reduce_data_type::tally_mempool_type>;
 
   //! union to hold either pointer to PinnedTally or pointer to value
   //  only use list before setup for device and only use val_ptr after
-  union tally_u {
+  union tally_u
+  {
     TallyType* list;
     T* val_ptr;
-    constexpr tally_u(TallyType* l) : list(l){};
-    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){};
+    constexpr tally_u(TallyType* l) : list(l) {};
+    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr) {};
   };
 
 public:
@@ -931,11 +1041,10 @@ class Reduce
   //! create a reduce object
   //  the original object's parent is itself
   explicit Reduce(T init_val, T identity_ = Combiner::identity())
-      : parent{this},
-        tally_or_val_ptr{new TallyType},
+      : parent {this},
+        tally_or_val_ptr {new TallyType},
         val(init_val, identity_)
-  {
-  }
+  {}
 
   void reset(T in_val, T identity_ = Combiner::identity())
   {
@@ -949,16 +1058,18 @@ class Reduce
   RAJA_HOST_DEVICE
   Reduce(const Reduce& other)
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-      : parent{other.parent},
+      : parent {other.parent},
 #else
-      : parent{&other},
+      : parent {&other},
 #endif
-        tally_or_val_ptr{other.tally_or_val_ptr},
+        tally_or_val_ptr {other.tally_or_val_ptr},
         val(other.val)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (parent) {
-      if (val.setupForDevice()) {
+    if (parent)
+    {
+      if (val.setupForDevice())
+      {
         tally_or_val_ptr.val_ptr = val.init_grid_vals(
             tally_or_val_ptr.list->new_value(currentResource()));
         parent = nullptr;
@@ -973,25 +1084,35 @@ class Reduce
   ~Reduce()
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (parent == this) {
+    if (parent == this)
+    {
       delete tally_or_val_ptr.list;
       tally_or_val_ptr.list = nullptr;
-    } else if (parent) {
-      if (val.value != val.identity) {
+    }
+    else if (parent)
+    {
+      if (val.value != val.identity)
+      {
 #if defined(RAJA_ENABLE_OPENMP)
         lock_guard<omp::mutex> lock(tally_or_val_ptr.list->m_mutex);
 #endif
         parent->combine(val.value);
       }
-    } else {
-      if (val.teardownForDevice()) {
+    }
+    else
+    {
+      if (val.teardownForDevice())
+      {
         tally_or_val_ptr.val_ptr = nullptr;
       }
     }
 #else
-    if (!parent->parent) {
+    if (!parent->parent)
+    {
       val.grid_reduce(tally_or_val_ptr.val_ptr);
-    } else {
+    }
+    else
+    {
       parent->combine(val.value);
     }
 #endif
@@ -1000,15 +1121,18 @@ class Reduce
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    auto n = tally_or_val_ptr.list->begin();
+    auto n   = tally_or_val_ptr.list->begin();
     auto end = tally_or_val_ptr.list->end();
-    if (n != end) {
+    if (n != end)
+    {
       tally_or_val_ptr.list->synchronize_resources();
       ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
           reducer(std::move(val.value));
-      for (; n != end; ++n) {
+      for (; n != end; ++n)
+      {
         T(&values)[tally_slots] = *n;
-        for (size_t r = 0; r < tally_slots; ++r) {
+        for (size_t r = 0; r < tally_slots; ++r)
+        {
           reducer.combine(std::move(values[r]));
         }
       }
@@ -1022,7 +1146,7 @@ class Reduce
 
   //! apply reduction (const version) -- still combines internal values
   RAJA_HOST_DEVICE
-  void combine(T other) const { Combiner{}(val.value, other); }
+  void combine(T other) const { Combiner {}(val.value, other); }
 
   /*!
    *  \return reference to the local value
@@ -1132,33 +1256,39 @@ class ReduceMax<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 //! specialization of ReduceMinLoc for hip_reduce
 template <typename tuning, typename T, typename IndexType>
 class ReduceMinLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
-    : public hip::Reduce<RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
-                          RAJA::reduce::detail::ValueLoc<T, IndexType>,
-                          tuning>
+    : public hip::Reduce<
+          RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType>,
+          tuning>
 {
 
 public:
-  using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType>;
-  using Combiner = RAJA::reduce::min<value_type>;
+  using value_type     = RAJA::reduce::detail::ValueLoc<T, IndexType>;
+  using Combiner       = RAJA::reduce::min<value_type>;
   using NonLocCombiner = RAJA::reduce::min<T>;
-  using Base = hip::Reduce<Combiner, value_type, tuning>;
+  using Base           = hip::Reduce<Combiner, value_type, tuning>;
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMinLoc(T init_val, IndexType init_idx,
+  ReduceMinLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
-  {
-  }
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
+  {}
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
@@ -1182,33 +1312,39 @@ class ReduceMinLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
 //! specialization of ReduceMaxLoc for hip_reduce
 template <typename tuning, typename T, typename IndexType>
 class ReduceMaxLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
-    : public hip::
-          Reduce<RAJA::reduce::max<RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
-                 RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
-                 tuning>
+    : public hip::Reduce<
+          RAJA::reduce::max<
+              RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
+          tuning>
 {
 public:
-  using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
-  using Combiner = RAJA::reduce::max<value_type>;
+  using value_type     = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
+  using Combiner       = RAJA::reduce::max<value_type>;
   using NonLocCombiner = RAJA::reduce::max<T>;
-  using Base = hip::Reduce<Combiner, value_type, tuning>;
+  using Base           = hip::Reduce<Combiner, value_type, tuning>;
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMaxLoc(T init_val, IndexType init_idx,
+  ReduceMaxLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
-  {
-  }
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
+  {}
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp
index cdf0a9b82d..17f91e5e2a 100644
--- a/include/RAJA/policy/hip/scan.hpp
+++ b/include/RAJA/policy/hip/scan.hpp
@@ -53,11 +53,10 @@ template <typename IterationMapping,
           bool Async,
           typename InputIter,
           typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-inclusive_inplace(
+RAJA_INLINE resources::EventProxy<resources::Hip> inclusive_inplace(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     Function binary_op)
@@ -66,23 +65,14 @@ inclusive_inplace(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      begin,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      begin, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             begin,
-                                             binary_op,
-                                             len,
+  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, begin, binary_op, len,
                                              stream));
 #endif
 
@@ -92,20 +82,11 @@ inclusive_inplace(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      begin,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      begin, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             begin,
-                                             binary_op,
-                                             len,
+  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, begin, binary_op, len,
                                              stream));
 #endif
   // Free temporary storage
@@ -127,11 +108,10 @@ template <typename IterationMapping,
           typename InputIter,
           typename Function,
           typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-exclusive_inplace(
+RAJA_INLINE resources::EventProxy<resources::Hip> exclusive_inplace(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     Function binary_op,
@@ -141,25 +121,14 @@ exclusive_inplace(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      begin,
-                                      init,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::exclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      begin, init, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             begin,
-                                             binary_op,
-                                             init,
-                                             len,
+  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, begin, binary_op, init, len,
                                              stream));
 #endif
   // Allocate temporary storage
@@ -168,22 +137,11 @@ exclusive_inplace(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              init,
-                                              len,
-                                              binary_op,
-                                              stream));
+  hipErrchk(::rocprim::exclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      begin, init, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             begin,
-                                             binary_op,
-                                             init,
-                                             len,
+  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, begin, binary_op, init, len,
                                              stream));
 #endif
   // Free temporary storage
@@ -205,38 +163,27 @@ template <typename IterationMapping,
           typename InputIter,
           typename OutputIter,
           typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-inclusive(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    InputIter begin,
-    InputIter end,
-    OutputIter out,
-    Function binary_op)
+RAJA_INLINE resources::EventProxy<resources::Hip>
+inclusive(resources::Hip hip_res,
+          ::RAJA::policy::hip::
+              hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+          InputIter begin,
+          InputIter end,
+          OutputIter out,
+          Function binary_op)
 {
   hipStream_t stream = hip_res.get_stream();
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      out, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             len,
-                                             stream));
+  hipErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -244,21 +191,11 @@ inclusive(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      out, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             len,
-                                             stream));
+  hipErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
@@ -280,40 +217,28 @@ template <typename IterationMapping,
           typename OutputIter,
           typename Function,
           typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-exclusive(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    InputIter begin,
-    InputIter end,
-    OutputIter out,
-    Function binary_op,
-    T init)
+RAJA_INLINE resources::EventProxy<resources::Hip>
+exclusive(resources::Hip hip_res,
+          ::RAJA::policy::hip::
+              hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+          InputIter begin,
+          InputIter end,
+          OutputIter out,
+          Function binary_op,
+          T init)
 {
   hipStream_t stream = hip_res.get_stream();
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      init,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::exclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      out, init, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             init,
-                                             len,
+  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, out, binary_op, init, len,
                                              stream));
 #endif
   // Allocate temporary storage
@@ -322,22 +247,11 @@ exclusive(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      init,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::exclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      out, init, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             init,
-                                             len,
+  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, out, binary_op, init, len,
                                              stream));
 #endif
   // Free temporary storage
diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp
index eb16246623..c0bc8808cb 100644
--- a/include/RAJA/policy/hip/sort.hpp
+++ b/include/RAJA/policy/hip/sort.hpp
@@ -51,52 +51,63 @@ namespace detail
 {
 
 #if defined(__HIPCC__)
-  template < typename R >
-  using double_buffer = ::rocprim::double_buffer<R>;
+template <typename R>
+using double_buffer = ::rocprim::double_buffer<R>;
 #elif defined(__CUDACC__)
-  template < typename R >
-  using double_buffer = ::cub::DoubleBuffer<R>;
+template <typename R>
+using double_buffer = ::cub::DoubleBuffer<R>;
 #endif
 
-  template < typename R >
-  R* get_current(double_buffer<R>& d_bufs)
-  {
+template <typename R>
+R* get_current(double_buffer<R>& d_bufs)
+{
 #if defined(__HIPCC__)
-    return d_bufs.current();
+  return d_bufs.current();
 #elif defined(__CUDACC__)
-    return d_bufs.Current();
+  return d_bufs.Current();
 #endif
-  }
-
 }
 
+}  // namespace detail
+
 /*!
         \brief static assert unimplemented stable sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter,
-    Iter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename Iter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter,
+       Iter,
+       Compare)
 {
-  static_assert(concepts::all_of<
-                  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                  std::is_pointer<Iter>,
-                  concepts::any_of<
-                    camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                    camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>::value,
-                "RAJA stable_sort<hip_exec> is only implemented for pointers to arithmetic types and RAJA::operators::less and RAJA::operators::greater.");
+  static_assert(
+      concepts::all_of<
+          type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+          std::is_pointer<Iter>,
+          concepts::any_of<
+              camp::is_same<Compare,
+                            operators::less<RAJA::detail::IterVal<Iter>>>,
+              camp::is_same<Compare, operators::greater<
+                                         RAJA::detail::IterVal<Iter>>>>>::value,
+      "RAJA stable_sort<hip_exec> is only implemented for pointers to "
+      "arithmetic types and RAJA::operators::less and "
+      "RAJA::operators::greater.");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -104,26 +115,28 @@ stable(
 /*!
         \brief stable sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>>)
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter begin,
+       Iter end,
+       operators::less<RAJA::detail::IterVal<Iter>>)
 {
   hipStream_t stream = hip_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
-  int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int len       = std::distance(begin, end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = hip::device_mempool_type::getInstance().malloc<R>(len);
@@ -133,24 +146,16 @@ stable(
   detail::double_buffer<R> d_keys(begin, d_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_keys(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_keys,
-                                       len,
-                                       begin_bit,
-                                       end_bit,
+  hipErrchk(::rocprim::radix_sort_keys(d_temp_storage, temp_storage_bytes,
+                                       d_keys, len, begin_bit, end_bit,
                                        stream));
 #elif defined(__CUDACC__)
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
-                                              temp_storage_bytes,
-                                              d_keys,
-                                              len,
-                                              begin_bit,
-                                              end_bit,
-                                              stream));
+                                              temp_storage_bytes, d_keys, len,
+                                              begin_bit, end_bit, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -159,29 +164,23 @@ stable(
 
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_keys(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_keys,
-                                       len,
-                                       begin_bit,
-                                       end_bit,
+  hipErrchk(::rocprim::radix_sort_keys(d_temp_storage, temp_storage_bytes,
+                                       d_keys, len, begin_bit, end_bit,
                                        stream));
 #elif defined(__CUDACC__)
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
-                                              temp_storage_bytes,
-                                              d_keys,
-                                              len,
-                                              begin_bit,
-                                              end_bit,
-                                              stream));
+                                              temp_storage_bytes, d_keys, len,
+                                              begin_bit, end_bit, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (detail::get_current(d_keys) == d_out) {
+  if (detail::get_current(d_keys) == d_out)
+  {
 
     // copy
-    hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(begin, d_out, len * sizeof(R), hipMemcpyDefault,
+                             stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_out);
@@ -194,26 +193,28 @@ stable(
 /*!
         \brief stable sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>>)
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter begin,
+       Iter end,
+       operators::greater<RAJA::detail::IterVal<Iter>>)
 {
   hipStream_t stream = hip_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
-  int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int len       = std::distance(begin, end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = hip::device_mempool_type::getInstance().malloc<R>(len);
@@ -223,24 +224,16 @@ stable(
   detail::double_buffer<R> d_keys(begin, d_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_keys_desc(d_temp_storage,
-                                            temp_storage_bytes,
-                                            d_keys,
-                                            len,
-                                            begin_bit,
-                                            end_bit,
+  hipErrchk(::rocprim::radix_sort_keys_desc(d_temp_storage, temp_storage_bytes,
+                                            d_keys, len, begin_bit, end_bit,
                                             stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
-                                                        temp_storage_bytes,
-                                                        d_keys,
-                                                        len,
-                                                        begin_bit,
-                                                        end_bit,
-                                                        stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, len, begin_bit, end_bit,
+      stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -249,29 +242,23 @@ stable(
 
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_keys_desc(d_temp_storage,
-                                            temp_storage_bytes,
-                                            d_keys,
-                                            len,
-                                            begin_bit,
-                                            end_bit,
+  hipErrchk(::rocprim::radix_sort_keys_desc(d_temp_storage, temp_storage_bytes,
+                                            d_keys, len, begin_bit, end_bit,
                                             stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
-                                                        temp_storage_bytes,
-                                                        d_keys,
-                                                        len,
-                                                        begin_bit,
-                                                        end_bit,
-                                                        stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, len, begin_bit, end_bit,
+      stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (detail::get_current(d_keys) == d_out) {
+  if (detail::get_current(d_keys) == d_out)
+  {
 
     // copy
-    hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(begin, d_out, len * sizeof(R), hipMemcpyDefault,
+                             stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_out);
@@ -285,30 +272,40 @@ stable(
 /*!
         \brief static assert unimplemented sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter,
-    Iter,
-    Compare)
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename Iter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+         Iter,
+         Iter,
+         Compare)
 {
-  static_assert(concepts::all_of<
-                  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                  std::is_pointer<Iter>,
-                  concepts::any_of<
-                    camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                    camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>::value,
-                "RAJA sort<hip_exec> is only implemented for pointers to arithmetic types and RAJA::operators::less and RAJA::operators::greater.");
+  static_assert(
+      concepts::all_of<
+          type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+          std::is_pointer<Iter>,
+          concepts::any_of<
+              camp::is_same<Compare,
+                            operators::less<RAJA::detail::IterVal<Iter>>>,
+              camp::is_same<Compare, operators::greater<
+                                         RAJA::detail::IterVal<Iter>>>>>::value,
+      "RAJA sort<hip_exec> is only implemented for pointers to arithmetic "
+      "types and RAJA::operators::less and RAJA::operators::greater.");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -316,18 +313,20 @@ unstable(
 /*!
         \brief sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+         Iter begin,
+         Iter end,
+         operators::less<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(hip_res, p, begin, end, comp);
 }
@@ -335,18 +334,20 @@ unstable(
 /*!
         \brief sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
           typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+         Iter begin,
+         Iter end,
+         operators::greater<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(hip_res, p, begin, end, comp);
 }
@@ -355,36 +356,47 @@ unstable(
 /*!
         \brief static assert unimplemented stable sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter,
     KeyIter,
     ValIter,
     Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "stable_sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "stable_sort_pairs<hip_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
+  static_assert(
+      type_traits::is_arithmetic<K>::value,
       "stable_sort_pairs<hip_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "stable_sort_pairs<hip_exec> is only implemented for "
+      "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -392,16 +404,21 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -412,9 +429,9 @@ stable_pairs(
   using K = RAJA::detail::IterVal<KeyIter>;
   using V = RAJA::detail::IterVal<ValIter>;
 
-  int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int len       = std::distance(keys_begin, keys_end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = hip::device_mempool_type::getInstance().malloc<K>(len);
@@ -426,26 +443,16 @@ stable_pairs(
   detail::double_buffer<V> d_vals(vals_begin, d_vals_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_pairs(d_temp_storage,
-                                        temp_storage_bytes,
-                                        d_keys,
-                                        d_vals,
-                                        len,
-                                        begin_bit,
-                                        end_bit,
+  hipErrchk(::rocprim::radix_sort_pairs(d_temp_storage, temp_storage_bytes,
+                                        d_keys, d_vals, len, begin_bit, end_bit,
                                         stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
-                                               temp_storage_bytes,
-                                               d_keys,
-                                               d_vals,
-                                               len,
-                                               begin_bit,
-                                               end_bit,
-                                               stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairs(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -454,36 +461,30 @@ stable_pairs(
 
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_pairs(d_temp_storage,
-                                        temp_storage_bytes,
-                                        d_keys,
-                                        d_vals,
-                                        len,
-                                        begin_bit,
-                                        end_bit,
+  hipErrchk(::rocprim::radix_sort_pairs(d_temp_storage, temp_storage_bytes,
+                                        d_keys, d_vals, len, begin_bit, end_bit,
                                         stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
-                                               temp_storage_bytes,
-                                               d_keys,
-                                               d_vals,
-                                               len,
-                                               begin_bit,
-                                               end_bit,
-                                               stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairs(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (detail::get_current(d_keys) == d_keys_out) {
+  if (detail::get_current(d_keys) == d_keys_out)
+  {
 
     // copy keys
-    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len * sizeof(K),
+                             hipMemcpyDefault, stream));
   }
-  if (detail::get_current(d_vals) == d_vals_out) {
+  if (detail::get_current(d_vals) == d_vals_out)
+  {
 
     // copy vals
-    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len * sizeof(V),
+                             hipMemcpyDefault, stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_keys_out);
@@ -497,16 +498,21 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -517,9 +523,9 @@ stable_pairs(
   using K = RAJA::detail::IterVal<KeyIter>;
   using V = RAJA::detail::IterVal<ValIter>;
 
-  int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int len       = std::distance(keys_begin, keys_end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = hip::device_mempool_type::getInstance().malloc<K>(len);
@@ -531,26 +537,16 @@ stable_pairs(
   detail::double_buffer<V> d_vals(vals_begin, d_vals_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_pairs_desc(d_temp_storage,
-                                             temp_storage_bytes,
-                                             d_keys,
-                                             d_vals,
-                                             len,
-                                             begin_bit,
-                                             end_bit,
-                                             stream));
+  hipErrchk(::rocprim::radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes,
+                                             d_keys, d_vals, len, begin_bit,
+                                             end_bit, stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
-                                                         temp_storage_bytes,
-                                                         d_keys,
-                                                         d_vals,
-                                                         len,
-                                                         begin_bit,
-                                                         end_bit,
-                                                         stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -559,36 +555,30 @@ stable_pairs(
 
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_pairs_desc(d_temp_storage,
-                                             temp_storage_bytes,
-                                             d_keys,
-                                             d_vals,
-                                             len,
-                                             begin_bit,
-                                             end_bit,
-                                             stream));
+  hipErrchk(::rocprim::radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes,
+                                             d_keys, d_vals, len, begin_bit,
+                                             end_bit, stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
-                                                         temp_storage_bytes,
-                                                         d_keys,
-                                                         d_vals,
-                                                         len,
-                                                         begin_bit,
-                                                         end_bit,
-                                                         stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (detail::get_current(d_keys) == d_keys_out) {
+  if (detail::get_current(d_keys) == d_keys_out)
+  {
 
     // copy keys
-    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len * sizeof(K),
+                             hipMemcpyDefault, stream));
   }
-  if (detail::get_current(d_vals) == d_vals_out) {
+  if (detail::get_current(d_vals) == d_vals_out)
+  {
 
     // copy vals
-    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len * sizeof(V),
+                             hipMemcpyDefault, stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_keys_out);
@@ -603,36 +593,47 @@ stable_pairs(
 /*!
         \brief static assert unimplemented sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter,
     KeyIter,
     ValIter,
     Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "sort_pairs<hip_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "sort_pairs<hip_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
+  static_assert(
+      type_traits::is_arithmetic<K>::value,
       "sort_pairs<hip_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "sort_pairs<hip_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "sort_pairs<hip_exec> is only implemented for RAJA::operators::less or "
+      "RAJA::operators::greater");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -640,16 +641,21 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -661,16 +667,21 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async,
+          typename KeyIter,
+          typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
diff --git a/include/RAJA/policy/openmp.hpp b/include/RAJA/policy/openmp.hpp
index fc29dabcbf..89a7997b31 100644
--- a/include/RAJA/policy/openmp.hpp
+++ b/include/RAJA/policy/openmp.hpp
@@ -30,7 +30,7 @@
 #include <thread>
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/openmp/atomic.hpp"
+#include "RAJA/policy/openmp/atomic.hpp"
 #endif
 
 #include "RAJA/policy/openmp/forall.hpp"
diff --git a/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
index 09861941ab..8a3263bfd2 100644
--- a/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
@@ -32,12 +32,12 @@ namespace detail
 {
 
 /*!
-* Populate and return a Dispatcher object
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate and return a Dispatcher object
+ */
+template <typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(omp_work const&)
 {
-  return get_Dispatcher<T, Dispatcher_T>(seq_work{});
+  return get_Dispatcher<T, Dispatcher_T>(seq_work {});
 }
 
 }  // namespace detail
diff --git a/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
index c889273a0f..f566ac741b 100644
--- a/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
@@ -38,23 +38,21 @@ namespace detail
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::omp_parallel_for_exec,
-        RAJA::omp_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::omp_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::omp_parallel_for_exec,
+                              RAJA::omp_work,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
+{};
 
 /*!
  * Runs work in a storage container in reverse order
@@ -63,23 +61,21 @@ struct WorkRunner<
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::omp_parallel_for_exec,
-        RAJA::omp_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::omp_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::omp_parallel_for_exec,
+                              RAJA::omp_work,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
+{};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/openmp/atomic.hpp b/include/RAJA/policy/openmp/atomic.hpp
index 2dc047dd95..43e790759d 100644
--- a/include/RAJA/policy/openmp/atomic.hpp
+++ b/include/RAJA/policy/openmp/atomic.hpp
@@ -35,8 +35,7 @@ namespace RAJA
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicLoad(omp_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(omp_atomic, T* acc)
 {
   T ret;
 #pragma omp atomic capture
@@ -49,13 +48,12 @@ RAJA_INLINE T atomicLoad(omp_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void atomicStore(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(omp_atomic, T* acc, T value)
 {
   T ret;
 #pragma omp atomic capture
   {
-    ret = *acc;
+    ret  = *acc;
     *acc = value;
   }
   RAJA_UNUSED_VAR(ret);
@@ -63,8 +61,7 @@ RAJA_INLINE void atomicStore(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -78,8 +75,7 @@ RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -93,15 +89,14 @@ RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(omp_atomic, T* acc, T value)
 {
 #if _OPENMP >= 202011
   T old;
-  #pragma omp atomic capture compare
+#pragma omp atomic capture compare
   {
     old = *acc;
-    if ( value < *acc )
+    if (value < *acc)
     {
       *acc = value;
     }
@@ -109,21 +104,20 @@ RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value)
   return old;
 #else
   // OpenMP doesn't define atomic ternary operators so use builtin atomics
-  return atomicMin(builtin_atomic{}, acc, value);
+  return atomicMin(builtin_atomic {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(omp_atomic, T* acc, T value)
 {
 #if _OPENMP >= 202011
   T old;
-  #pragma omp atomic capture compare
+#pragma omp atomic capture compare
   {
     old = *acc;
-    if ( value > *acc )
+    if (value > *acc)
     {
       *acc = value;
     }
@@ -131,15 +125,14 @@ RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value)
   return old;
 #else
   // OpenMP doesn't define atomic ternary operators so use builtin atomics
-  return atomicMax(builtin_atomic{}, acc, value);
+  return atomicMax(builtin_atomic {}, acc, value);
 #endif
 }
 
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(omp_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(omp_atomic, T* acc)
 {
   T old;
 #pragma omp atomic capture
@@ -153,18 +146,16 @@ RAJA_INLINE T atomicInc(omp_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(omp_atomic, T* acc, T value)
 {
   // OpenMP doesn't define needed operations, so use builtin atomics
-  return RAJA::atomicInc(builtin_atomic{}, acc, value);
+  return RAJA::atomicInc(builtin_atomic {}, acc, value);
 }
 
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(omp_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(omp_atomic, T* acc)
 {
   T old;
 #pragma omp atomic capture
@@ -178,17 +169,15 @@ RAJA_INLINE T atomicDec(omp_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(omp_atomic, T* acc, T value)
 {
   // OpenMP doesn't define needed operations, so use builtin atomics
-  return RAJA::atomicDec(builtin_atomic{}, acc, value);
+  return RAJA::atomicDec(builtin_atomic {}, acc, value);
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -201,8 +190,7 @@ RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -215,8 +203,7 @@ RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -229,13 +216,12 @@ RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(omp_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
   {
-    old = *acc;  // capture old for return value
+    old  = *acc;  // capture old for return value
     *acc = value;
   }
   return old;
@@ -243,14 +229,13 @@ RAJA_INLINE T atomicExchange(omp_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(omp_atomic, T *acc, T compare, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(omp_atomic, T* acc, T compare, T value)
 {
   // OpenMP doesn't define atomic ternary operators so use builtin atomics
-  return RAJA::atomicCAS(builtin_atomic{}, acc, compare, value);
+  return RAJA::atomicCAS(builtin_atomic {}, acc, compare, value);
 }
 
-#endif // not defined RAJA_COMPILER_MSVC
+#endif  // not defined RAJA_COMPILER_MSVC
 
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/openmp/forall.hpp b/include/RAJA/policy/openmp/forall.hpp
index 815168ae98..b842a9bfc5 100644
--- a/include/RAJA/policy/openmp/forall.hpp
+++ b/include/RAJA/policy/openmp/forall.hpp
@@ -55,23 +55,27 @@ namespace policy
 namespace omp
 {
 
-template <typename Iterable, typename Func, typename InnerPolicy, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+template <typename Iterable,
+          typename Func,
+          typename InnerPolicy,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_parallel_exec<InnerPolicy>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam f_params)
 {
-  RAJA::region<RAJA::omp_parallel_region>([&]() {
-    using RAJA::internal::thread_privatize;
-    auto body = thread_privatize(loop_body);
-    forall_impl(host_res, InnerPolicy{}, iter, body.get_priv(), f_params);
-  });
+  RAJA::region<RAJA::omp_parallel_region>(
+      [&]()
+      {
+        using RAJA::internal::thread_privatize;
+        auto body = thread_privatize(loop_body);
+        forall_impl(host_res, InnerPolicy {}, iter, body.get_priv(), f_params);
+      });
   return resources::EventProxy<resources::Host>(host_res);
 }
 
@@ -83,249 +87,283 @@ forall_impl(resources::Host host_res,
 namespace internal
 {
 
-  /// Tag dispatch for omp forall
+/// Tag dispatch for omp forall
 
-  //
-  // omp for (Auto)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Auto&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for (Auto)
+//
+template <typename Iterable, typename Func>
+RAJA_INLINE void
+forall_impl(const ::RAJA::policy::omp::Auto&, Iterable&& iter, Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(static)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(static)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(static, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(static, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
-  }
-
-  //
-  // omp for schedule(dynamic)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(dynamic)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(dynamic, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(dynamic)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(dynamic)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(dynamic, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(guided)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(dynamic, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(dynamic, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(guided)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(guided, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(guided)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(guided)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(guided, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(runtime)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(guided, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(guided, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(runtime)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  // TODO :: not implemented in forall param interface ...
-  #if !defined(RAJA_COMPILER_MSVC)
-  // dynamic & guided
-  template <typename Policy, typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const Policy&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(runtime)
+//
+template <typename Iterable, typename Func>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(runtime)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    omp_sched_t prev_sched;
-    int prev_chunk;
-    omp_get_schedule(&prev_sched, &prev_chunk);
-    omp_set_schedule(Policy::schedule, Policy::chunk_size);
-    forall_impl(::RAJA::policy::omp::Runtime{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
-    omp_set_schedule(prev_sched, prev_chunk);
+    loop_body(begin_it[i]);
   }
-  #endif
+}
 
+// TODO :: not implemented in forall param interface ...
+#if !defined(RAJA_COMPILER_MSVC)
+// dynamic & guided
+template <typename Policy, typename Iterable, typename Func>
+RAJA_INLINE void forall_impl(const Policy&, Iterable&& iter, Func&& loop_body)
+{
+  omp_sched_t prev_sched;
+  int prev_chunk;
+  omp_get_schedule(&prev_sched, &prev_chunk);
+  omp_set_schedule(Policy::schedule, Policy::chunk_size);
+  forall_impl(::RAJA::policy::omp::Runtime {}, std::forward<Iterable>(iter),
+              std::forward<Func>(loop_body));
+  omp_set_schedule(prev_sched, prev_chunk);
+}
+#endif
 
-  /// Tag dispatch for omp forall with nowait
 
-  //
-  // omp for nowait (Auto)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
-  }
+/// Tag dispatch for omp forall with nowait
 
-  //
-  // omp for schedule(static) nowait
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for nowait (Auto)
+//
+template <typename Iterable, typename Func>
+RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto&,
+                                    Iterable&& iter,
+                                    Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static) nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(static, ChunkSize) nowait
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(static) nowait
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void
+forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>&,
+                   Iterable&& iter,
+                   Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static) nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static, ChunkSize) nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //TODO :: not implemented in param interface...
-  #if !defined(RAJA_COMPILER_MSVC)
-  // dynamic & guided
-  template <typename Policy, typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl_nowait(const Policy&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(static, ChunkSize) nowait
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void
+forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>&,
+                   Iterable&& iter,
+                   Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static, ChunkSize) nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    omp_sched_t prev_sched;
-    int prev_chunk;
-    omp_get_schedule(&prev_sched, &prev_chunk);
-    omp_set_schedule(Policy::schedule, Policy::chunk_size);
-    forall_impl_nowait(::RAJA::policy::omp::Runtime{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
-    omp_set_schedule(prev_sched, prev_chunk);
+    loop_body(begin_it[i]);
   }
-  #endif
-
-} // end namespace internal
+}
 
-template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+// TODO :: not implemented in param interface...
+#if !defined(RAJA_COMPILER_MSVC)
+// dynamic & guided
+template <typename Policy, typename Iterable, typename Func>
+RAJA_INLINE void
+forall_impl_nowait(const Policy&, Iterable&& iter, Func&& loop_body)
+{
+  omp_sched_t prev_sched;
+  int prev_chunk;
+  omp_get_schedule(&prev_sched, &prev_chunk);
+  omp_set_schedule(Policy::schedule, Policy::chunk_size);
+  forall_impl_nowait(::RAJA::policy::omp::Runtime {},
+                     std::forward<Iterable>(iter),
+                     std::forward<Func>(loop_body));
+  omp_set_schedule(prev_sched, prev_chunk);
+}
+#endif
+
+}  // end namespace internal
+
+template <typename Schedule,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_for_schedule_exec<Schedule>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam)
 {
-  internal::forall_impl(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
+  internal::forall_impl(Schedule {}, std::forward<Iterable>(iter),
+                        std::forward<Func>(loop_body));
   return resources::EventProxy<resources::Host>(host_res);
 }
 
-template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+template <typename Schedule,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_for_nowait_schedule_exec<Schedule>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam)
 {
-  internal::forall_impl_nowait(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
+  internal::forall_impl_nowait(Schedule {}, std::forward<Iterable>(iter),
+                               std::forward<Func>(loop_body));
   return resources::EventProxy<resources::Host>(host_res);
 }
 
diff --git a/include/RAJA/policy/openmp/kernel/Collapse.hpp b/include/RAJA/policy/openmp/kernel/Collapse.hpp
index ba71ac2fbf..76e0ca3fbc 100644
--- a/include/RAJA/policy/openmp/kernel/Collapse.hpp
+++ b/include/RAJA/policy/openmp/kernel/Collapse.hpp
@@ -38,8 +38,8 @@ namespace RAJA
 struct omp_parallel_collapse_exec
     : make_policy_pattern_t<RAJA::Policy::openmp,
                             RAJA::Pattern::forall,
-                            RAJA::policy::omp::For> {
-};
+                            RAJA::policy::omp::For>
+{};
 
 namespace internal
 {
@@ -48,10 +48,15 @@ namespace internal
 // Collapsing two loops
 /////////
 
-template <camp::idx_t Arg0, camp::idx_t Arg1, typename... EnclosedStmts, typename Types>
+template <camp::idx_t Arg0,
+          camp::idx_t Arg1,
+          typename... EnclosedStmts,
+          typename Types>
 struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1>,
-                                             EnclosedStmts...>, Types> {
+                                             EnclosedStmts...>,
+                         Types>
+{
 
 
   template <typename Data>
@@ -71,14 +76,17 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp parallel for private(i0, i1) firstprivate(privatizer) \
+#pragma omp parallel for private(i0, i1) firstprivate(privatizer)              \
     RAJA_COLLAPSE(2)
-    for (i0 = 0; i0 < l0; ++i0) {
-      for (i1 = 0; i1 < l1; ++i1) {
+    for (i0 = 0; i0 < l0; ++i0)
+    {
+      for (i1 = 0; i1 < l1; ++i1)
+      {
         auto& private_data = privatizer.get_priv();
         private_data.template assign_offset<Arg0>(i0);
         private_data.template assign_offset<Arg1>(i1);
-        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(private_data);
+        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(
+            private_data);
       }
     }
   }
@@ -92,7 +100,9 @@ template <camp::idx_t Arg0,
           typename Types>
 struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2>,
-                                             EnclosedStmts...>, Types> {
+                                             EnclosedStmts...>,
+                         Types>
+{
 
 
   template <typename Data>
@@ -101,9 +111,9 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
     const auto l0 = segment_length<Arg0>(data);
     const auto l1 = segment_length<Arg1>(data);
     const auto l2 = segment_length<Arg2>(data);
-    auto i0 = l0;
-    auto i1 = l1;
-    auto i2 = l2;
+    auto i0       = l0;
+    auto i1       = l1;
+    auto i2       = l2;
 
     // Set the argument types for this loop
     using NewTypes0 = setSegmentTypeFromData<Types, Arg0, Data>;
@@ -112,16 +122,20 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp parallel for private(i0, i1, i2) firstprivate(privatizer) \
+#pragma omp parallel for private(i0, i1, i2) firstprivate(privatizer)          \
     RAJA_COLLAPSE(3)
-    for (i0 = 0; i0 < l0; ++i0) {
-      for (i1 = 0; i1 < l1; ++i1) {
-        for (i2 = 0; i2 < l2; ++i2) {
+    for (i0 = 0; i0 < l0; ++i0)
+    {
+      for (i1 = 0; i1 < l1; ++i1)
+      {
+        for (i2 = 0; i2 < l2; ++i2)
+        {
           auto& private_data = privatizer.get_priv();
           private_data.template assign_offset<Arg0>(i0);
           private_data.template assign_offset<Arg1>(i1);
           private_data.template assign_offset<Arg2>(i2);
-          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(private_data);
+          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(
+              private_data);
         }
       }
     }
@@ -129,9 +143,6 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
 };
 
 
-
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp b/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
index 65f56010bc..be051f1209 100644
--- a/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
+++ b/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
@@ -30,38 +30,33 @@
 #include "RAJA/policy/openmp/policy.hpp"
 
 
-
 namespace RAJA
 {
 
 namespace statement
 {
-struct OmpSyncThreads : public internal::Statement<camp::nil> {
-};
+struct OmpSyncThreads : public internal::Statement<camp::nil>
+{};
 
-} // namespace statement
+}  // namespace statement
 
 namespace internal
 {
 
 
-
-//Statement executor to synchronize omp threads inside a kernel region
-template<typename Types>
-struct StatementExecutor<statement::OmpSyncThreads, Types> {
-
-template<typename Data>
-static RAJA_INLINE void exec(Data &&)
+// Statement executor to synchronize omp threads inside a kernel region
+template <typename Types>
+struct StatementExecutor<statement::OmpSyncThreads, Types>
 {
-  #pragma omp barrier
-}
 
+  template <typename Data>
+  static RAJA_INLINE void exec(Data&&)
+  {
+#pragma omp barrier
+  }
 };
 
 
-
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp
index 7856bd6fda..2beb61ceba 100644
--- a/include/RAJA/policy/openmp/launch.hpp
+++ b/include/RAJA/policy/openmp/launch.hpp
@@ -25,48 +25,60 @@ namespace RAJA
 {
 
 template <>
-struct LaunchExecute<RAJA::omp_launch_t> {
+struct LaunchExecute<RAJA::omp_launch_t>
+{
 
   template <typename BODY, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, LaunchParams const &params, const char *, BODY const &body, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const& params,
+       const char*,
+       BODY const& body,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-
-        LaunchContext ctx;
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          LaunchContext ctx;
 
-        using RAJA::internal::thread_privatize;
-        auto loop_body = thread_privatize(body);
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
-        ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
+          ctx.shared_mem_ptr = (char*)malloc(params.shared_mem_size);
 
-        loop_body.get_priv()(ctx);
+          loop_body.get_priv()(ctx);
 
-        free(ctx.shared_mem_ptr);
-        ctx.shared_mem_ptr = nullptr;
-    });
+          free(ctx.shared_mem_ptr);
+          ctx.shared_mem_ptr = nullptr;
+        });
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  template<typename ReduceParams, typename BODY>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *RAJA_UNUSED_ARG(kernel_name),  BODY const &body, ReduceParams &f_params)
+  template <typename ReduceParams, typename BODY>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const& launch_params,
+       const char* RAJA_UNUSED_ARG(kernel_name),
+       BODY const& body,
+       ReduceParams& f_params)
   {
 
     using EXEC_POL = RAJA::omp_launch_t;
 
     expt::ParamMultiplexer::init<EXEC_POL>(f_params);
 
-    //reducer object must be named f_params as expected by macro below
+    // reducer object must be named f_params as expected by macro below
     RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-   #pragma omp parallel reduction(combine : f_params)
+#pragma omp parallel reduction(combine : f_params)
     {
 
       LaunchContext ctx;
@@ -74,7 +86,7 @@ struct LaunchExecute<RAJA::omp_launch_t> {
       using RAJA::internal::thread_privatize;
       auto loop_body = thread_privatize(body);
 
-      ctx.shared_mem_ptr = (char*) malloc(launch_params.shared_mem_size);
+      ctx.shared_mem_ptr = (char*)malloc(launch_params.shared_mem_size);
 
       expt::invoke_body(f_params, loop_body.get_priv(), ctx);
 
@@ -86,120 +98,136 @@ struct LaunchExecute<RAJA::omp_launch_t> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
 template <typename SEGMENT>
-struct LoopExecute<omp_parallel_for_exec, SEGMENT> {
+struct LoopExecute<omp_parallel_for_exec, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 #pragma omp for
-      for (int i = 0; i < len; i++) {
+          for (int i = 0; i < len; i++)
+          {
 
-        loop_body.get_priv()(*(segment.begin() + i));
-      }
-    });
+            loop_body.get_priv()(*(segment.begin() + i));
+          }
+        });
   }
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
 #pragma omp for
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-
-          loop_body.get_priv()(*(segment0.begin() + i),
-                               *(segment1.begin() + j));
-        }
-      }
-    });
+          for (int j = 0; j < len1; j++)
+          {
+            for (int i = 0; i < len0; i++)
+            {
+
+              loop_body.get_priv()(*(segment0.begin() + i),
+                                   *(segment1.begin() + j));
+            }
+          }
+        });
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
 #pragma omp for
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
-            loop_body.get_priv()(*(segment0.begin() + i),
-                                 *(segment1.begin() + j),
-                                 *(segment2.begin() + k));
+          for (int k = 0; k < len2; k++)
+          {
+            for (int j = 0; j < len1; j++)
+            {
+              for (int i = 0; i < len0; i++)
+              {
+                loop_body.get_priv()(*(segment0.begin() + i),
+                                     *(segment1.begin() + j),
+                                     *(segment2.begin() + k));
+              }
+            }
           }
-        }
-      }
-    });
+        });
   }
 };
 
 template <typename SEGMENT>
-struct LoopExecute<omp_for_exec, SEGMENT> {
+struct LoopExecute<omp_for_exec, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
 #pragma omp for
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
 
       body(*(segment.begin() + i));
     }
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-    for (int j = 0; j < len1; j++) {
-      for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++)
+    {
+      for (int i = 0; i < len0; i++)
+      {
 
         body(*(segment0.begin() + i), *(segment1.begin() + j));
       }
@@ -207,12 +235,12 @@ struct LoopExecute<omp_for_exec, SEGMENT> {
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
@@ -220,11 +248,13 @@ struct LoopExecute<omp_for_exec, SEGMENT> {
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-    for (int k = 0; k < len2; k++) {
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
+    for (int k = 0; k < len2; k++)
+    {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
+          body(*(segment0.begin() + i), *(segment1.begin() + j),
                *(segment2.begin() + k));
         }
       }
@@ -236,53 +266,54 @@ struct LoopExecute<omp_for_exec, SEGMENT> {
 // Return local index
 //
 template <typename SEGMENT>
-struct LoopICountExecute<omp_for_exec, SEGMENT> {
+struct LoopICountExecute<omp_for_exec, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
 
 #pragma omp for
-      for (int i = 0; i < len; i++) {
-        body(*(segment.begin() + i), i);
-      }
+    for (int i = 0; i < len; i++)
+    {
+      body(*(segment.begin() + i), i);
+    }
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++)
+    {
+      for (int i = 0; i < len0; i++)
+      {
 
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
-               i,
-               j);
-        }
+        body(*(segment0.begin() + i), *(segment1.begin() + j), i, j);
       }
+    }
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
@@ -290,18 +321,17 @@ struct LoopICountExecute<omp_for_exec, SEGMENT> {
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
-            body(*(segment0.begin() + i),
-                 *(segment1.begin() + j),
-                 *(segment2.begin() + k),
-                 i,
-                 j,
-                 k);
-          }
+    for (int k = 0; k < len2; k++)
+    {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
+          body(*(segment0.begin() + i), *(segment1.begin() + j),
+               *(segment2.begin() + k), i, j, k);
         }
       }
+    }
   }
 };
 
@@ -309,219 +339,246 @@ struct LoopICountExecute<omp_for_exec, SEGMENT> {
 struct omp_parallel_nested_for_exec;
 
 template <typename SEGMENT>
-struct LoopExecute<omp_parallel_nested_for_exec, SEGMENT> {
+struct LoopExecute<omp_parallel_nested_for_exec, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
 #pragma omp for RAJA_COLLAPSE(2)
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-
-          loop_body.get_priv()(*(segment0.begin() + i),
-                               *(segment1.begin() + j));
-        }
-      }
-    });
+          for (int j = 0; j < len1; j++)
+          {
+            for (int i = 0; i < len0; i++)
+            {
+
+              loop_body.get_priv()(*(segment0.begin() + i),
+                                   *(segment1.begin() + j));
+            }
+          }
+        });
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
 #pragma omp for RAJA_COLLAPSE(3)
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
-            loop_body.get_priv()(*(segment0.begin() + i),
-                                 *(segment1.begin() + j),
-                                 *(segment2.begin() + k));
+          for (int k = 0; k < len2; k++)
+          {
+            for (int j = 0; j < len1; j++)
+            {
+              for (int i = 0; i < len0; i++)
+              {
+                loop_body.get_priv()(*(segment0.begin() + i),
+                                     *(segment1.begin() + j),
+                                     *(segment2.begin() + k));
+              }
+            }
           }
-        }
-      }
-    });
+        });
   }
 };
 
 // Return local index
 template <typename SEGMENT>
-struct LoopICountExecute<omp_parallel_nested_for_exec, SEGMENT> {
+struct LoopICountExecute<omp_parallel_nested_for_exec, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
 #pragma omp for RAJA_COLLAPSE(2)
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-
-          loop_body.get_priv()(*(segment0.begin() + i),
-                               *(segment1.begin() + j),
-                               i,
-                               j);
-        }
-      }
-    });
+          for (int j = 0; j < len1; j++)
+          {
+            for (int i = 0; i < len0; i++)
+            {
+
+              loop_body.get_priv()(*(segment0.begin() + i),
+                                   *(segment1.begin() + j), i, j);
+            }
+          }
+        });
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
 #pragma omp for RAJA_COLLAPSE(3)
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
-            loop_body.get_priv()(*(segment0.begin() + i),
-                                 *(segment1.begin() + j),
-                                 *(segment2.begin() + k),
-                                 i,
-                                 j,
-                                 k);
+          for (int k = 0; k < len2; k++)
+          {
+            for (int j = 0; j < len1; j++)
+            {
+              for (int i = 0; i < len0; i++)
+              {
+                loop_body.get_priv()(*(segment0.begin() + i),
+                                     *(segment1.begin() + j),
+                                     *(segment2.begin() + k), i, j, k);
+              }
+            }
           }
-        }
-      }
-    });
+        });
   }
 };
 
 
 template <typename SEGMENT>
-struct TileExecute<omp_parallel_for_exec, SEGMENT> {
+struct TileExecute<omp_parallel_for_exec, SEGMENT>
+{
 
   template <typename BODY, typename TILE_T>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
 
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
 #pragma omp for
-      for (int i = 0; i < len; i += tile_size) {
-        loop_body.get_priv()(segment.slice(i, tile_size));
-      }
-    });
+          for (int i = 0; i < len; i += tile_size)
+          {
+            loop_body.get_priv()(segment.slice(i, tile_size));
+          }
+        });
   }
 };
 
 template <typename SEGMENT>
-struct TileTCountExecute<omp_parallel_for_exec, SEGMENT> {
+struct TileTCountExecute<omp_parallel_for_exec, SEGMENT>
+{
 
   template <typename BODY, typename TILE_T>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
-    const int len = segment.end() - segment.begin();
+    const int len      = segment.end() - segment.begin();
     const int numTiles = (len - 1) / tile_size + 1;
 
-    RAJA::region<RAJA::omp_parallel_region>([&]() {
-      using RAJA::internal::thread_privatize;
-      auto loop_body = thread_privatize(body);
+    RAJA::region<RAJA::omp_parallel_region>(
+        [&]()
+        {
+          using RAJA::internal::thread_privatize;
+          auto loop_body = thread_privatize(body);
 
 #pragma omp parallel for
-      for (int i = 0; i < numTiles; i++) {
-        const int i_tile_size = i * tile_size;
-        loop_body.get_priv()(segment.slice(i_tile_size, tile_size), i);
-      }
-    });
+          for (int i = 0; i < numTiles; i++)
+          {
+            const int i_tile_size = i * tile_size;
+            loop_body.get_priv()(segment.slice(i_tile_size, tile_size), i);
+          }
+        });
   }
 };
 
 template <typename SEGMENT>
-struct TileExecute<omp_for_exec, SEGMENT> {
+struct TileExecute<omp_for_exec, SEGMENT>
+{
 
   template <typename BODY, typename TILE_T>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
 #pragma omp for
-    for (int i = 0; i < len; i += tile_size) {
+    for (int i = 0; i < len; i += tile_size)
+    {
       body(segment.slice(i, tile_size));
     }
   }
 };
 
 template <typename SEGMENT>
-struct TileTCountExecute<omp_for_exec, SEGMENT> {
+struct TileTCountExecute<omp_for_exec, SEGMENT>
+{
 
   template <typename BODY, typename TILE_T>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
-    const int len = segment.end() - segment.begin();
+    const int len      = segment.end() - segment.begin();
     const int numTiles = (len - 1) / tile_size + 1;
 
 #pragma omp for
-    for (int i = 0; i < numTiles; i++) {
+    for (int i = 0; i < numTiles; i++)
+    {
       const int i_tile_size = i * tile_size;
       body(segment.slice(i_tile_size, tile_size), i);
     }
diff --git a/include/RAJA/policy/openmp/multi_reduce.hpp b/include/RAJA/policy/openmp/multi_reduce.hpp
index 22b09a7722..9aa61217b3 100644
--- a/include/RAJA/policy/openmp/multi_reduce.hpp
+++ b/include/RAJA/policy/openmp/multi_reduce.hpp
@@ -56,7 +56,7 @@ namespace detail
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template <typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataOMP;
 
 /*!
@@ -68,47 +68,56 @@ struct MultiReduceDataOMP;
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataOMP<T, t_MultiReduceOp,
-    RAJA::omp::MultiReduceTuning<RAJA::omp::multi_reduce_algorithm::combine_on_destruction>>
+template <typename T, typename t_MultiReduceOp>
+struct MultiReduceDataOMP<
+    T,
+    t_MultiReduceOp,
+    RAJA::omp::MultiReduceTuning<
+        RAJA::omp::multi_reduce_algorithm::combine_on_destruction>>
 {
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataOMP() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr >
+  template <typename Container,
+            std::enable_if_t<
+                !std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr>
   MultiReduceDataOMP(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_num_bins(container.size())
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_num_bins(container.size()),
+        m_identity(identity),
+        m_data(nullptr)
   {
     m_data = create_data(container, m_num_bins);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_identity(other.m_identity)
-      , m_data(nullptr)
+  MultiReduceDataOMP(MultiReduceDataOMP const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_identity(other.m_identity),
+        m_data(nullptr)
   {
-    m_data = create_data(RepeatView<value_type>(other.m_identity, other.m_num_bins), other.m_num_bins);
+    m_data =
+        create_data(RepeatView<value_type>(other.m_identity, other.m_num_bins),
+                    other.m_num_bins);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP(MultiReduceDataOMP&&)                 = delete;
   MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete;
-  MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP& operator=(MultiReduceDataOMP&&)      = delete;
 
   ~MultiReduceDataOMP()
   {
-    if (m_data) {
-      if (m_parent && (m_num_bins != size_t(0))) {
+    if (m_data)
+    {
+      if (m_parent && (m_num_bins != size_t(0)))
+      {
 #pragma omp critical(ompMultiReduceCritical)
         {
-          for (size_t bin = 0; bin < m_num_bins; ++bin) {
-            MultiReduceOp{}(m_parent->m_data[bin], m_data[bin]);
+          for (size_t bin = 0; bin < m_num_bins; ++bin)
+          {
+            MultiReduceOp {}(m_parent->m_data[bin], m_data[bin]);
           }
         }
       }
@@ -116,18 +125,22 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
     }
   }
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
-    m_identity = identity;
+    m_identity          = identity;
     size_t new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
+    if (new_num_bins != m_num_bins)
+    {
       destroy_data(m_data, m_num_bins);
       m_num_bins = new_num_bins;
-      m_data = create_data(container, m_num_bins);
-    } else {
+      m_data     = create_data(container, m_num_bins);
+    }
+    else
+    {
       size_t bin = 0;
-      for (auto const& value : container) {
+      for (auto const& value : container)
+      {
         m_data[bin] = value;
         ++bin;
       }
@@ -138,26 +151,29 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); }
+  void combine(size_t bin, T const& val) { MultiReduceOp {}(m_data[bin], val); }
 
   T get(size_t bin) const { return m_data[bin]; }
 
 private:
-  MultiReduceDataOMP const *m_parent;
+  MultiReduceDataOMP const* m_parent;
   size_t m_num_bins;
   T m_identity;
   T* m_data;
 
-  template < typename Container >
+  template <typename Container>
   static T* create_data(Container const& container, size_t num_bins)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
-    auto data = RAJA::allocate_aligned_type<T>( RAJA::DATA_ALIGN, num_bins * sizeof(T) );
+    auto data =
+        RAJA::allocate_aligned_type<T>(RAJA::DATA_ALIGN, num_bins * sizeof(T));
     size_t bin = 0;
-    for (auto const& value : container) {
-      new(&data[bin]) T(value);
+    for (auto const& value : container)
+    {
+      new (&data[bin]) T(value);
       ++bin;
     }
     return data;
@@ -165,11 +181,13 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   static void destroy_data(T*& data, size_t num_bins)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
-    for (size_t bin = num_bins; bin > 0; --bin) {
-      data[bin-1].~T();
+    for (size_t bin = num_bins; bin > 0; --bin)
+    {
+      data[bin - 1].~T();
     }
     RAJA::free_aligned(data);
     data = nullptr;
@@ -185,74 +203,93 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataOMP<T, t_MultiReduceOp,
-    RAJA::omp::MultiReduceTuning<RAJA::omp::multi_reduce_algorithm::combine_on_get>>
+template <typename T, typename t_MultiReduceOp>
+struct MultiReduceDataOMP<
+    T,
+    t_MultiReduceOp,
+    RAJA::omp::MultiReduceTuning<
+        RAJA::omp::multi_reduce_algorithm::combine_on_get>>
 {
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataOMP() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr >
+  template <typename Container,
+            std::enable_if_t<
+                !std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr>
   MultiReduceDataOMP(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_max_threads(omp_get_max_threads())
-      , m_num_bins(container.size())
-      , m_padded_threads(pad_threads(m_max_threads))
-      , m_padded_bins(pad_bins(m_num_bins))
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_max_threads(omp_get_max_threads()),
+        m_num_bins(container.size()),
+        m_padded_threads(pad_threads(m_max_threads)),
+        m_padded_bins(pad_bins(m_num_bins)),
+        m_identity(identity),
+        m_data(nullptr)
   {
-    m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
+    m_data = create_data(container, identity, m_num_bins, m_max_threads,
+                         m_padded_bins, m_padded_threads);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_padded_threads(other.m_padded_threads)
-      , m_padded_bins(other.m_padded_bins)
-      , m_identity(other.m_identity)
-      , m_data(other.m_data)
-  { }
+  MultiReduceDataOMP(MultiReduceDataOMP const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_padded_threads(other.m_padded_threads),
+        m_padded_bins(other.m_padded_bins),
+        m_identity(other.m_identity),
+        m_data(other.m_data)
+  {}
 
-  MultiReduceDataOMP(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP(MultiReduceDataOMP&&)                 = delete;
   MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete;
-  MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP& operator=(MultiReduceDataOMP&&)      = delete;
 
   ~MultiReduceDataOMP()
   {
-    if (m_data) {
-      if (!m_parent) {
-        destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
+    if (m_data)
+    {
+      if (!m_parent)
+      {
+        destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins,
+                     m_padded_threads);
       }
     }
   }
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
-    m_identity = identity;
+    m_identity          = identity;
     size_t new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
-      destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
-      m_num_bins = new_num_bins;
+    if (new_num_bins != m_num_bins)
+    {
+      destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins,
+                   m_padded_threads);
+      m_num_bins    = new_num_bins;
       m_padded_bins = pad_bins(m_num_bins);
-      m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
-    } else {
-      if (m_max_threads > 0) {
+      m_data = create_data(container, identity, m_num_bins, m_max_threads,
+                           m_padded_bins, m_padded_threads);
+    }
+    else
+    {
+      if (m_max_threads > 0)
+      {
         {
           size_t thread_idx = 0;
-          size_t bin = 0;
-          for (auto const& value : container) {
-            m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = value;
+          size_t bin        = 0;
+          for (auto const& value : container)
+          {
+            m_data[index_data(bin, thread_idx, m_padded_bins,
+                              m_padded_threads)] = value;
             ++bin;
           }
         }
-        for (size_t thread_idx = 1; thread_idx < m_max_threads; ++thread_idx) {
-          for (size_t bin = 0; bin < m_num_bins; ++bin) {
-            m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = identity;
+        for (size_t thread_idx = 1; thread_idx < m_max_threads; ++thread_idx)
+        {
+          for (size_t bin = 0; bin < m_num_bins; ++bin)
+          {
+            m_data[index_data(bin, thread_idx, m_padded_bins,
+                              m_padded_threads)] = identity;
           }
         }
       }
@@ -263,24 +300,28 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val)
+  void combine(size_t bin, T const& val)
   {
     size_t thread_idx = omp_get_thread_num();
-    MultiReduceOp{}(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)], val);
+    MultiReduceOp {}(
+        m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)],
+        val);
   }
 
   T get(size_t bin) const
   {
     ::RAJA::detail::HighAccuracyReduce<T, typename MultiReduceOp::operator_type>
         reducer(m_identity);
-    for (size_t thread_idx = 0; thread_idx < m_max_threads; ++thread_idx) {
-      reducer.combine(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)]);
+    for (size_t thread_idx = 0; thread_idx < m_max_threads; ++thread_idx)
+    {
+      reducer.combine(
+          m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)]);
     }
     return reducer.get_and_clear();
   }
 
 private:
-  MultiReduceDataOMP const *m_parent;
+  MultiReduceDataOMP const* m_parent;
   size_t m_max_threads;
   size_t m_num_bins;
   size_t m_padded_threads;
@@ -290,8 +331,10 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   static constexpr size_t pad_bins(size_t num_bins)
   {
-    size_t num_cache_lines = RAJA_DIVIDE_CEILING_INT(num_bins*sizeof(T), RAJA::DATA_ALIGN);
-    return RAJA_DIVIDE_CEILING_INT(num_cache_lines * RAJA::DATA_ALIGN, sizeof(T));
+    size_t num_cache_lines =
+        RAJA_DIVIDE_CEILING_INT(num_bins * sizeof(T), RAJA::DATA_ALIGN);
+    return RAJA_DIVIDE_CEILING_INT(num_cache_lines * RAJA::DATA_ALIGN,
+                                   sizeof(T));
   }
 
   static constexpr size_t pad_threads(size_t max_threads)
@@ -299,33 +342,46 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
     return max_threads;
   }
 
-  static constexpr size_t index_data(size_t bin, size_t thread_idx,
-                                     size_t padded_bins, size_t RAJA_UNUSED_ARG(padded_threads))
+  static constexpr size_t index_data(size_t bin,
+                                     size_t thread_idx,
+                                     size_t padded_bins,
+                                     size_t RAJA_UNUSED_ARG(padded_threads))
   {
     return bin + thread_idx * padded_bins;
   }
 
-  template < typename Container >
-  static T* create_data(Container const& container, T identity,
-                        size_t num_bins, size_t max_threads,
-                        size_t padded_bins, size_t padded_threads)
+  template <typename Container>
+  static T* create_data(Container const& container,
+                        T identity,
+                        size_t num_bins,
+                        size_t max_threads,
+                        size_t padded_bins,
+                        size_t padded_threads)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
-    auto data = RAJA::allocate_aligned_type<T>( RAJA::DATA_ALIGN, padded_threads*padded_bins*sizeof(T) );
-    if (max_threads > 0) {
+    auto data = RAJA::allocate_aligned_type<T>(
+        RAJA::DATA_ALIGN, padded_threads * padded_bins * sizeof(T));
+    if (max_threads > 0)
+    {
       {
         size_t thread_idx = 0;
-        size_t bin = 0;
-        for (auto const& value : container) {
-          new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(value);
+        size_t bin        = 0;
+        for (auto const& value : container)
+        {
+          new (&data[index_data(bin, thread_idx, padded_bins, padded_threads)])
+              T(value);
           ++bin;
         }
       }
-      for (size_t thread_idx = 1; thread_idx < max_threads; ++thread_idx) {
-        for (size_t bin = 0; bin < num_bins; ++bin) {
-          new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(identity);
+      for (size_t thread_idx = 1; thread_idx < max_threads; ++thread_idx)
+      {
+        for (size_t bin = 0; bin < num_bins; ++bin)
+        {
+          new (&data[index_data(bin, thread_idx, padded_bins, padded_threads)])
+              T(identity);
         }
       }
     }
@@ -333,15 +389,21 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
   }
 
   static void destroy_data(T*& data,
-                           size_t num_bins, size_t max_threads,
-                           size_t padded_bins, size_t padded_threads)
+                           size_t num_bins,
+                           size_t max_threads,
+                           size_t padded_bins,
+                           size_t padded_threads)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
-    for (size_t thread_idx = max_threads; thread_idx > 0; --thread_idx) {
-      for (size_t bin = num_bins; bin > 0; --bin) {
-        data[index_data(bin-1, thread_idx-1, padded_bins, padded_threads)].~T();
+    for (size_t thread_idx = max_threads; thread_idx > 0; --thread_idx)
+    {
+      for (size_t bin = num_bins; bin > 0; --bin)
+      {
+        data[index_data(bin - 1, thread_idx - 1, padded_bins, padded_threads)]
+            .~T();
       }
     }
     RAJA::free_aligned(data);
@@ -351,7 +413,8 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
 }  // namespace detail
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::omp::omp_multi_reduce_policy, detail::MultiReduceDataOMP)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::omp::omp_multi_reduce_policy,
+                                detail::MultiReduceDataOMP)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/params/forall.hpp b/include/RAJA/policy/openmp/params/forall.hpp
index d9bea5d0d8..1f8c2a5e95 100644
--- a/include/RAJA/policy/openmp/params/forall.hpp
+++ b/include/RAJA/policy/openmp/params/forall.hpp
@@ -18,303 +18,356 @@ namespace omp
 namespace expt
 {
 
-  namespace internal
-  {
-    //
-    // omp for (Auto)
-    //
-    template <typename ExecPol, typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol, RAJA::policy::omp::Auto> >
-    forall_impl(const ExecPol& p,
-                Iterable&& iter,
-                Func&& loop_body,
-                ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+namespace internal
+{
+//
+// omp for (Auto)
+//
+template <typename ExecPol,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if<std::is_same<ExecPol, RAJA::policy::omp::Auto>>
+forall_impl(const ExecPol& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(static)
-    //
-    template <template<int> class ExecPol, typename Iterable, typename Func, int ChunkSize, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
-                         std::integral_constant<bool,(ChunkSize <= 0)> >
-    forall_impl(const ExecPol<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(static) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+//
+// omp for schedule(static)
+//
+template <template <int> class ExecPol,
+          typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if<
+    std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
+    std::integral_constant<bool, (ChunkSize <= 0)>>
+forall_impl(const ExecPol<ChunkSize>& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(static) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(static, ChunkSize)
-    //
-    template <template<int> class ExecPol, typename Iterable, typename Func, int ChunkSize, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
-                         std::integral_constant<bool,(ChunkSize > 0)> >
-    forall_impl(const ExecPol<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(static, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+//
+// omp for schedule(static, ChunkSize)
+//
+template <template <int> class ExecPol,
+          typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if<
+    std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
+    std::integral_constant<bool, (ChunkSize > 0)>>
+forall_impl(const ExecPol<ChunkSize>& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(static, ChunkSize)                           \
+    reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for schedule(runtime)
-    //
-    template <typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+//
+// omp for schedule(runtime)
+//
+template <typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(runtime) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(runtime) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for nowait (Auto)
-    //
-    template <typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+//
+// omp for nowait (Auto)
+//
+template <typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto& p,
+                                    Iterable&& iter,
+                                    Func&& loop_body,
+                                    ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA_EXTRACT_BED_IT(iter);
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp parallel
-      {
-      #pragma omp for nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
-
-    //
-    // omp for schedule(dynamic)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
+  {
+#pragma omp for nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
     {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+    }
+  }
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(dynamic) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+//
+// omp for schedule(dynamic)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-    //
-    // omp for schedule(dynamic, ChunkSize)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(dynamic) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(dynamic, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+//
+// omp for schedule(dynamic, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(dynamic, ChunkSize)                          \
+    reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(guided)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(guided) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+//
+// omp for schedule(guided)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(guided) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(guided, ChunkSize)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(guided, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+//
+// omp for schedule(guided, ChunkSize)
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(guided, ChunkSize)                           \
+    reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for schedule(static) nowait
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+//
+// omp for schedule(static) nowait
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void
+forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>& p,
+                   Iterable&& iter,
+                   Func&& loop_body,
+                   ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA_EXTRACT_BED_IT(iter);
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp parallel
-      {
-      #pragma omp for schedule(static) nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+  {
+#pragma omp for schedule(static) nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
+    {
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
     }
+  }
 
-    //
-    // omp for schedule(static, ChunkSize) nowait
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
+//
+// omp for schedule(static, ChunkSize) nowait
+//
+template <typename Iterable,
+          typename Func,
+          int ChunkSize,
+          typename ForallParam,
+          typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void
+forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>& p,
+                   Iterable&& iter,
+                   Func&& loop_body,
+                   ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp parallel
-      {
-      #pragma omp for schedule(static, ChunkSize) nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+  {
+#pragma omp for schedule(static, ChunkSize) nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
+    {
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
     }
+  }
 
-  } //  namespace internal
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-  template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-  RAJA_INLINE resources::EventProxy<resources::Host> forall_impl(resources::Host host_res,
-                                                                 const omp_for_schedule_exec<Schedule>&,
-                                                                 Iterable&& iter,
-                                                                 Func&& loop_body,
-                                                                 ForallParam f_params)
-  {
-    expt::internal::forall_impl(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body), std::forward<ForallParam>(f_params));
-    return resources::EventProxy<resources::Host>(host_res);
-  }
-} //  namespace expt
+}  //  namespace internal
+
+template <typename Schedule,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE resources::EventProxy<resources::Host>
+forall_impl(resources::Host host_res,
+            const omp_for_schedule_exec<Schedule>&,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam f_params)
+{
+  expt::internal::forall_impl(Schedule {}, std::forward<Iterable>(iter),
+                              std::forward<Func>(loop_body),
+                              std::forward<ForallParam>(f_params));
+  return resources::EventProxy<resources::Host>(host_res);
+}
+}  //  namespace expt
 
 ///
 /// OpenMP parallel policy implementation
 ///
-template <typename Iterable, typename Func, typename InnerPolicy, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+template <typename Iterable,
+          typename Func,
+          typename InnerPolicy,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Host host_res,
             const omp_parallel_exec<InnerPolicy>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam f_params)
 {
-  expt::forall_impl(host_res, InnerPolicy{}, iter, loop_body, f_params);
+  expt::forall_impl(host_res, InnerPolicy {}, iter, loop_body, f_params);
   return resources::EventProxy<resources::Host>(host_res);
 }
 
diff --git a/include/RAJA/policy/openmp/params/kernel_name.hpp b/include/RAJA/policy/openmp/params/kernel_name.hpp
index 65a5f7a329..3a6c6d9bea 100644
--- a/include/RAJA/policy/openmp/params/kernel_name.hpp
+++ b/include/RAJA/policy/openmp/params/kernel_name.hpp
@@ -3,38 +3,42 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  combine(KernelName&, T& /*place holder argument*/) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>>
+init(KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template <typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>>
+combine(KernelName&, T& /*place holder argument*/)
+{}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>>
+resolve(KernelName&)
+{
+  // TODO: Define kernel naming
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp/params/reduce.hpp b/include/RAJA/policy/openmp/params/reduce.hpp
index f71efc255a..c312f05adf 100644
--- a/include/RAJA/policy/openmp/params/reduce.hpp
+++ b/include/RAJA/policy/openmp/params/reduce.hpp
@@ -3,37 +3,43 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>>
+init(Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>>
+combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>>
+resolve(Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_OMP_REDUCE_HPP
+#endif  //  NEW_REDUCE_OMP_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp/policy.hpp b/include/RAJA/policy/openmp/policy.hpp
index aff2567474..6a1299065c 100644
--- a/include/RAJA/policy/openmp/policy.hpp
+++ b/include/RAJA/policy/openmp/policy.hpp
@@ -26,15 +26,16 @@
 #include "RAJA/policy/atomic_builtin.hpp"
 
 #if defined(RAJA_COMPILER_MSVC)
-typedef enum omp_sched_t { 
-    // schedule kinds 
-    omp_sched_static = 0x1, 
-    omp_sched_dynamic = 0x2, 
-    omp_sched_guided = 0x3, 
-    omp_sched_auto = 0x4, 
-    
-    // schedule modifier 
-    omp_sched_monotonic = 0x80000000u 
+typedef enum omp_sched_t
+{
+  // schedule kinds
+  omp_sched_static  = 0x1,
+  omp_sched_dynamic = 0x2,
+  omp_sched_guided  = 0x3,
+  omp_sched_auto    = 0x4,
+
+  // schedule modifier
+  omp_sched_monotonic = 0x80000000u
 } omp_sched_t;
 #else
 #include <omp.h>
@@ -51,7 +52,7 @@ enum struct multi_reduce_algorithm : int
   combine_on_get
 };
 
-template < multi_reduce_algorithm t_algorithm >
+template <multi_reduce_algorithm t_algorithm>
 struct MultiReduceTuning
 {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
@@ -59,7 +60,7 @@ struct MultiReduceTuning
       (algorithm == multi_reduce_algorithm::combine_on_get);
 };
 
-} // namspace omp
+}  // namespace omp
 
 namespace policy
 {
@@ -68,14 +69,16 @@ namespace omp
 
 namespace internal
 {
-    struct ScheduleTag {};
-
-    template <omp_sched_t Sched, int Chunk>
-    struct Schedule : public ScheduleTag {
-        constexpr static omp_sched_t schedule = Sched;
-        constexpr static int chunk_size = Chunk;
-        constexpr static Policy policy = Policy::openmp;
-    };
+struct ScheduleTag
+{};
+
+template <omp_sched_t Sched, int Chunk>
+struct Schedule : public ScheduleTag
+{
+  constexpr static omp_sched_t schedule = Sched;
+  constexpr static int chunk_size       = Chunk;
+  constexpr static Policy policy        = Policy::openmp;
+};
 }  // namespace internal
 
 //
@@ -86,23 +89,23 @@ namespace internal
 //////////////////////////////////////////////////////////////////////
 //
 
-struct Parallel {
-};
+struct Parallel
+{};
 
-struct For {
-};
+struct For
+{};
 
-struct NoWait {
-};
+struct NoWait
+{};
 
 static constexpr int default_chunk_size = -1;
 
-struct Auto : public internal::Schedule<omp_sched_auto, default_chunk_size>{
-};
+struct Auto : public internal::Schedule<omp_sched_auto, default_chunk_size>
+{};
 
 template <int ChunkSize = default_chunk_size>
-struct Static : public internal::Schedule<omp_sched_static, ChunkSize> {
-};
+struct Static : public internal::Schedule<omp_sched_static, ChunkSize>
+{};
 
 template <int ChunkSize = default_chunk_size>
 using Dynamic = internal::Schedule<omp_sched_dynamic, ChunkSize>;
@@ -110,8 +113,9 @@ using Dynamic = internal::Schedule<omp_sched_dynamic, ChunkSize>;
 template <int ChunkSize = default_chunk_size>
 using Guided = internal::Schedule<omp_sched_guided, ChunkSize>;
 
-struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1), default_chunk_size> {
-};
+struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1),
+                                            default_chunk_size>
+{};
 
 //
 //////////////////////////////////////////////////////////////////////
@@ -122,39 +126,41 @@ struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1), defaul
 //
 
 ///
-///  Struct supporting OpenMP parallel region. 
+///  Struct supporting OpenMP parallel region.
 ///
 struct omp_parallel_region
     : make_policy_pattern_launch_platform_t<Policy::openmp,
                                             Pattern::region,
                                             Launch::undefined,
-                                            Platform::host> {
-};
+                                            Platform::host>
+{};
 
 ///
 ///  Struct supporting OpenMP parallel region for Teams
 ///
-struct omp_launch_t
-    : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::region,
-                                            Launch::undefined,
-                                            Platform::host> {
-};
+struct omp_launch_t : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                                            Pattern::region,
+                                                            Launch::undefined,
+                                                            Platform::host>
+{};
 
 
 ///
 ///  Struct supporting OpenMP 'for nowait schedule( )'
 ///
 template <typename Sched>
-struct omp_for_nowait_schedule_exec : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                                              Pattern::forall,
-                                                              Launch::undefined,
-                                                              Platform::host,
-                                                              omp::For,
-                                                              omp::NoWait,
-                                                              Sched> {
-    static_assert(std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
-        "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
+struct omp_for_nowait_schedule_exec
+    : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                            Pattern::forall,
+                                            Launch::undefined,
+                                            Platform::host,
+                                            omp::For,
+                                            omp::NoWait,
+                                            Sched>
+{
+  static_assert(
+      std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
+      "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
 };
 
 
@@ -162,14 +168,17 @@ struct omp_for_nowait_schedule_exec : make_policy_pattern_launch_platform_t<Poli
 ///  Struct supporting OpenMP 'for schedule( )'
 ///
 template <typename Sched>
-struct omp_for_schedule_exec : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                                              Pattern::forall,
-                                                              Launch::undefined,
-                                                              Platform::host,
-                                                              omp::For,
-                                                              Sched> {
-    static_assert(std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
-        "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
+struct omp_for_schedule_exec
+    : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                            Pattern::forall,
+                                            Launch::undefined,
+                                            Platform::host,
+                                            omp::For,
+                                            Sched>
+{
+  static_assert(
+      std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
+      "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
 };
 
 ///
@@ -196,52 +205,58 @@ using omp_for_runtime_exec = omp_for_schedule_exec<omp::Runtime>;
 
 ///
 ///  Internal type aliases supporting 'omp for schedule( ) nowait' for specific
-///  schedule types. 
+///  schedule types.
 ///
 ///  IMPORTANT: We only provide a nowait policy option for static scheduling
 ///             since that is the only scheduling case that can be used with
-///             nowait and be correct in general. Paraphrasing the OpenMP 
+///             nowait and be correct in general. Paraphrasing the OpenMP
 ///             standard:
-///             
-///             Programs that depend on which thread executes a particular 
+///
+///             Programs that depend on which thread executes a particular
 ///             iteration under any circumstance other than static schedule
 ///             are non-conforming.
 ///
 template <int ChunkSize = default_chunk_size>
-using omp_for_nowait_static_exec = omp_for_nowait_schedule_exec<omp::Static<ChunkSize>>;
+using omp_for_nowait_static_exec =
+    omp_for_nowait_schedule_exec<omp::Static<ChunkSize>>;
 
 ///
 ///  Struct supporting OpenMP 'parallel' region containing an inner loop
 ///  execution construct.
 ///
 template <typename InnerPolicy>
-using omp_parallel_exec = make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::forall,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            omp::Parallel,
-                                            wrapper<InnerPolicy>>;
+using omp_parallel_exec =
+    make_policy_pattern_launch_platform_t<Policy::openmp,
+                                          Pattern::forall,
+                                          Launch::undefined,
+                                          Platform::host,
+                                          omp::Parallel,
+                                          wrapper<InnerPolicy>>;
 
 ///
-///  Internal type aliases supporting 'omp parallel for schedule( )' for 
+///  Internal type aliases supporting 'omp parallel for schedule( )' for
 ///  specific schedule types.
 ///
 using omp_parallel_for_exec = omp_parallel_exec<omp_for_exec>;
 
 ///
 template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_static_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Static<ChunkSize>> >;
+using omp_parallel_for_static_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Static<ChunkSize>>>;
 
 ///
 template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_dynamic_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Dynamic<ChunkSize>> >;
+using omp_parallel_for_dynamic_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Dynamic<ChunkSize>>>;
 
 ///
 template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_guided_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Guided<ChunkSize>> >;
+using omp_parallel_for_guided_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Guided<ChunkSize>>>;
 
 ///
-using omp_parallel_for_runtime_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Runtime>>;
+using omp_parallel_for_runtime_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Runtime>>;
 
 
 ///
@@ -265,13 +280,13 @@ using omp_parallel_segit = omp_parallel_for_segit;
 ///////////////////////////////////////////////////////////////////////
 ///
 struct omp_taskgraph_segit
-    : make_policy_pattern_t<Policy::openmp, Pattern::taskgraph, omp::Parallel> {
-};
+    : make_policy_pattern_t<Policy::openmp, Pattern::taskgraph, omp::Parallel>
+{};
 
 ///
 struct omp_taskgraph_interval_segit
-    : make_policy_pattern_t<Policy::openmp, Pattern::taskgraph, omp::Parallel> {
-};
+    : make_policy_pattern_t<Policy::openmp, Pattern::taskgraph, omp::Parallel>
+{};
 
 
 ///
@@ -284,8 +299,8 @@ struct omp_taskgraph_interval_segit
 struct omp_work : make_policy_pattern_launch_platform_t<Policy::openmp,
                                                         Pattern::workgroup_exec,
                                                         Launch::sync,
-                                                        Platform::host> {
-};
+                                                        Platform::host>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -294,31 +309,31 @@ struct omp_work : make_policy_pattern_launch_platform_t<Policy::openmp,
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
-struct omp_reduce : make_policy_pattern_t<Policy::openmp, Pattern::reduce> {
-};
+struct omp_reduce : make_policy_pattern_t<Policy::openmp, Pattern::reduce>
+{};
 
 ///
 struct omp_reduce_ordered
-    : make_policy_pattern_t<Policy::openmp, Pattern::reduce, reduce::ordered> {
-};
+    : make_policy_pattern_t<Policy::openmp, Pattern::reduce, reduce::ordered>
+{};
 
 ///
-template < typename tuning >
-struct omp_multi_reduce_policy
-    : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::multi_reduce,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            std::conditional_t<tuning::consistent,
-                                                               reduce::ordered,
-                                                               reduce::unordered>> {
-};
+template <typename tuning>
+struct omp_multi_reduce_policy : make_policy_pattern_launch_platform_t<
+                                     Policy::openmp,
+                                     Pattern::multi_reduce,
+                                     Launch::undefined,
+                                     Platform::host,
+                                     std::conditional_t<tuning::consistent,
+                                                        reduce::ordered,
+                                                        reduce::unordered>>
+{};
 
 ///
 struct omp_synchronize : make_policy_pattern_launch_t<Policy::openmp,
                                                       Pattern::synchronize,
-                                                      Launch::sync> {
-};
+                                                      Launch::sync>
+{};
 
 #if defined(RAJA_COMPILER_MSVC)
 
@@ -327,14 +342,15 @@ using omp_atomic = builtin_atomic;
 
 #else  // RAJA_COMPILER_MSVC not defined
 
-struct omp_atomic {};
+struct omp_atomic
+{};
 
 #endif
 
 
-template < RAJA::omp::multi_reduce_algorithm algorithm >
-using omp_multi_reduce_tuning = omp_multi_reduce_policy<
-    RAJA::omp::MultiReduceTuning<algorithm> >;
+template <RAJA::omp::multi_reduce_algorithm algorithm>
+using omp_multi_reduce_tuning =
+    omp_multi_reduce_policy<RAJA::omp::MultiReduceTuning<algorithm>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - combine_on_destruction policies combine new values into a single value for
@@ -344,8 +360,8 @@ using omp_multi_reduce_combine_on_destruction = omp_multi_reduce_tuning<
     RAJA::omp::multi_reduce_algorithm::combine_on_destruction>;
 // - combine_on_get policies combine new values into a single value for
 //   each thread then when get is called those values are combined.
-using omp_multi_reduce_combine_on_get = omp_multi_reduce_tuning<
-    RAJA::omp::multi_reduce_algorithm::combine_on_get>;
+using omp_multi_reduce_combine_on_get =
+    omp_multi_reduce_tuning<RAJA::omp::multi_reduce_algorithm::combine_on_get>;
 
 // Policy for RAJA::MultiReduce* objects that gives the
 // same answer every time when used in the same way
@@ -395,18 +411,19 @@ using policy::omp::omp_parallel_for_segit;
 using policy::omp::omp_parallel_segit;
 
 ///
-/// Type alias for omp parallel region containing an inner 'omp for' loop 
+/// Type alias for omp parallel region containing an inner 'omp for' loop
 /// execution policy. Inner policy types follow.
 ///
 using policy::omp::omp_parallel_exec;
 
 ///
-/// Type alias for 'omp for' loop execution within an omp_parallel_exec construct
+/// Type alias for 'omp for' loop execution within an omp_parallel_exec
+/// construct
 ///
 using policy::omp::omp_for_exec;
 
 ///
-/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a 
+/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a
 /// scheduling policy within an omp_parallel_exec construct
 /// Scheduling policies are near the top of this file and include:
 /// RAJA::policy::omp::{Auto, Static, Dynamic, Guided, Runtime}
@@ -421,7 +438,7 @@ using policy::omp::omp_for_schedule_exec;
 using policy::omp::omp_for_nowait_schedule_exec;
 
 ///
-/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a 
+/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a
 /// static scheduling policy within an omp_parallel_exec construct
 ///
 using policy::omp::omp_for_static_exec;
@@ -437,8 +454,8 @@ using policy::omp::omp_for_runtime_exec;
 ///
 /// Type aliases for omp parallel region
 ///
-using policy::omp::omp_parallel_region;
 using policy::omp::omp_launch_t;
+using policy::omp::omp_parallel_region;
 
 ///
 /// Type aliases for omp reductions
diff --git a/include/RAJA/policy/openmp/reduce.hpp b/include/RAJA/policy/openmp/reduce.hpp
index 7ccc68c3a1..7fb0953c03 100644
--- a/include/RAJA/policy/openmp/reduce.hpp
+++ b/include/RAJA/policy/openmp/reduce.hpp
@@ -55,7 +55,8 @@ class ReduceOMP
 
   ~ReduceOMP()
   {
-    if (Base::parent) {
+    if (Base::parent)
+    {
 #pragma omp critical(ompReduceCritical)
       Reduce()(Base::parent->local(), Base::my_data);
       Base::my_data = Base::identity;
@@ -101,20 +102,22 @@ class ReduceOMPOrdered
 
   ~ReduceOMPOrdered()
   {
-    Reduce{}((*data)[omp_get_thread_num()], Base::my_data);
+    Reduce {}((*data)[omp_get_thread_num()], Base::my_data);
     Base::my_data = Base::identity;
   }
 
   T get_combined() const
   {
-    if (Base::my_data != Base::identity) {
-      Reduce{}((*data)[omp_get_thread_num()], Base::my_data);
+    if (Base::my_data != Base::identity)
+    {
+      Reduce {}((*data)[omp_get_thread_num()], Base::my_data);
       Base::my_data = Base::identity;
     }
 
     T res = Base::identity;
-    for (size_t i = 0; i < data->size(); ++i) {
-      Reduce{}(res, (*data)[i]);
+    for (size_t i = 0; i < data->size(); ++i)
+    {
+      Reduce {}(res, (*data)[i]);
     }
     return res;
   }
diff --git a/include/RAJA/policy/openmp/region.hpp b/include/RAJA/policy/openmp/region.hpp
index 88f0519abf..80f2dbd84a 100644
--- a/include/RAJA/policy/openmp/region.hpp
+++ b/include/RAJA/policy/openmp/region.hpp
@@ -35,15 +35,15 @@ namespace omp
  */
 
 template <typename Func>
-RAJA_INLINE void region_impl(const omp_parallel_region &, Func &&body)
+RAJA_INLINE void region_impl(const omp_parallel_region&, Func&& body)
 {
 
 #pragma omp parallel
-    { // curly brackets to ensure body() is encapsulated in omp parallel region
-      //thread private copy of body
-      auto loopbody = body;
-      loopbody();
-    }
+  {  // curly brackets to ensure body() is encapsulated in omp parallel region
+    // thread private copy of body
+    auto loopbody = body;
+    loopbody();
+  }
 }
 
 }  // namespace omp
diff --git a/include/RAJA/policy/openmp/scan.hpp b/include/RAJA/policy/openmp/scan.hpp
index 97cd7a8ab8..555075aeac 100644
--- a/include/RAJA/policy/openmp/scan.hpp
+++ b/include/RAJA/policy/openmp/scan.hpp
@@ -44,39 +44,39 @@ namespace scan
    initial value
 */
 template <typename Policy, typename Iter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-inclusive_inplace(
-    resources::Host host_res,
-    const Policy&,
-    Iter begin,
-    Iter end,
-    BinFn f)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+inclusive_inplace(resources::Host host_res,
+                  const Policy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f)
 {
-  using std::distance;
   using RAJA::detail::firstIndex;
-  using Value = typename ::std::iterator_traits<Iter>::value_type;
-  const auto n = distance(begin, end);
+  using std::distance;
+  using Value     = typename ::std::iterator_traits<Iter>::value_type;
+  const auto n    = distance(begin, end);
   using DistanceT = typename std::remove_const<decltype(n)>::type;
-  const int p0 = std::min(n, static_cast<DistanceT>(omp_get_max_threads()));
+  const int p0    = std::min(n, static_cast<DistanceT>(omp_get_max_threads()));
   ::std::vector<Value> sums(p0, Value());
 #pragma omp parallel num_threads(p0)
   {
-    const int p = omp_get_num_threads();
-    const int pid = omp_get_thread_num();
+    const int p               = omp_get_num_threads();
+    const int pid             = omp_get_thread_num();
     const DistanceT idx_begin = firstIndex(n, p, pid);
-    const DistanceT idx_end = firstIndex(n, p, pid + 1);
-    if (idx_begin != idx_end) {
-      inclusive_inplace(host_res, ::RAJA::seq_exec{},
-                        begin + idx_begin, begin + idx_end, f);
+    const DistanceT idx_end   = firstIndex(n, p, pid + 1);
+    if (idx_begin != idx_end)
+    {
+      inclusive_inplace(host_res, ::RAJA::seq_exec {}, begin + idx_begin,
+                        begin + idx_end, f);
       sums[pid] = begin[idx_end - 1];
     }
 #pragma omp barrier
 #pragma omp single
-    exclusive_inplace(host_res, ::RAJA::seq_exec{},
-                      sums.data(), sums.data() + p, f, BinFn::identity());
-    for (auto i = idx_begin; i < idx_end; ++i) {
+    exclusive_inplace(host_res, ::RAJA::seq_exec {}, sums.data(),
+                      sums.data() + p, f, BinFn::identity());
+    for (auto i = idx_begin; i < idx_end; ++i)
+    {
       begin[i] = f(begin[i], sums[pid]);
     }
   }
@@ -89,42 +89,42 @@ inclusive_inplace(
    initial value
 */
 template <typename Policy, typename Iter, typename BinFn, typename ValueT>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-exclusive_inplace(
-    resources::Host host_res,
-    const Policy&,
-    Iter begin,
-    Iter end,
-    BinFn f,
-    ValueT v)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+exclusive_inplace(resources::Host host_res,
+                  const Policy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f,
+                  ValueT v)
 {
-  using std::distance;
   using RAJA::detail::firstIndex;
-  using Value = typename ::std::iterator_traits<Iter>::value_type;
-  const auto n = distance(begin, end);
+  using std::distance;
+  using Value     = typename ::std::iterator_traits<Iter>::value_type;
+  const auto n    = distance(begin, end);
   using DistanceT = typename std::remove_const<decltype(n)>::type;
-  const int p0 = std::min(n, static_cast<DistanceT>(omp_get_max_threads()));
+  const int p0    = std::min(n, static_cast<DistanceT>(omp_get_max_threads()));
   ::std::vector<Value> sums(p0, v);
 #pragma omp parallel num_threads(p0)
   {
-    const int p = omp_get_num_threads();
-    const int pid = omp_get_thread_num();
+    const int p               = omp_get_num_threads();
+    const int pid             = omp_get_thread_num();
     const DistanceT idx_begin = firstIndex(n, p, pid);
-    const DistanceT idx_end = firstIndex(n, p, pid + 1);
-    const Value init = ((pid == 0) ? v : *(begin + idx_begin - 1));
+    const DistanceT idx_end   = firstIndex(n, p, pid + 1);
+    const Value init          = ((pid == 0) ? v : *(begin + idx_begin - 1));
 #pragma omp barrier
-    if (idx_begin != idx_end) {
-      exclusive_inplace(host_res, seq_exec{},
-                        begin + idx_begin, begin + idx_end, f, init);
+    if (idx_begin != idx_end)
+    {
+      exclusive_inplace(host_res, seq_exec {}, begin + idx_begin,
+                        begin + idx_end, f, init);
       sums[pid] = begin[idx_end - 1];
     }
 #pragma omp barrier
 #pragma omp single
-    exclusive_inplace(host_res, seq_exec{},
-                      sums.data(), sums.data() + p, f, BinFn::identity());
-    for (auto i = idx_begin; i < idx_end; ++i) {
+    exclusive_inplace(host_res, seq_exec {}, sums.data(), sums.data() + p, f,
+                      BinFn::identity());
+    for (auto i = idx_begin; i < idx_end; ++i)
+    {
       begin[i] = f(begin[i], sums[pid]);
     }
   }
@@ -137,16 +137,14 @@ exclusive_inplace(
    initial value
 */
 template <typename Policy, typename Iter, typename OutIter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-inclusive(
-    resources::Host host_res,
-    const Policy& exec,
-    Iter begin,
-    Iter end,
-    OutIter out,
-    BinFn f)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+inclusive(resources::Host host_res,
+          const Policy& exec,
+          Iter begin,
+          Iter end,
+          OutIter out,
+          BinFn f)
 {
   using std::distance;
   ::std::copy(begin, end, out);
@@ -162,21 +160,20 @@ template <typename Policy,
           typename OutIter,
           typename BinFn,
           typename ValueT>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-exclusive(
-    resources::Host host_res,
-    const Policy& exec,
-    Iter begin,
-    Iter end,
-    OutIter out,
-    BinFn f,
-    ValueT v)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+exclusive(resources::Host host_res,
+          const Policy& exec,
+          Iter begin,
+          Iter end,
+          OutIter out,
+          BinFn f,
+          ValueT v)
 {
   using std::distance;
   ::std::copy(begin, end, out);
-  return exclusive_inplace(host_res, exec, out, out + distance(begin, end), f, v);
+  return exclusive_inplace(host_res, exec, out, out + distance(begin, end), f,
+                           v);
 }
 
 }  // namespace scan
diff --git a/include/RAJA/policy/openmp/sort.hpp b/include/RAJA/policy/openmp/sort.hpp
index 9e4474d692..91f8c1d2a2 100644
--- a/include/RAJA/policy/openmp/sort.hpp
+++ b/include/RAJA/policy/openmp/sort.hpp
@@ -62,16 +62,18 @@ inline void sort_task(Sorter sorter,
                       RAJA::detail::IterDiff<Iter> iterates_per_task,
                       Compare comp)
 {
-  using diff_type = RAJA::detail::IterDiff<Iter>;
+  using diff_type   = RAJA::detail::IterDiff<Iter>;
   const diff_type n = i_end - i_begin;
 
-  if (n <= iterates_per_task) {
-
-    sorter(begin+i_begin, begin+i_end, comp);
+  if (n <= iterates_per_task)
+  {
 
-  } else {
+    sorter(begin + i_begin, begin + i_end, comp);
+  }
+  else
+  {
 
-    const diff_type i_middle = i_begin + n/2;
+    const diff_type i_middle = i_begin + n / 2;
 
 #pragma omp task
     sort_task(sorter, begin, i_begin, i_middle, iterates_per_task, comp);
@@ -81,8 +83,10 @@ inline void sort_task(Sorter sorter,
 
 #pragma omp taskwait
 
-    //std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
-    RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
+    // std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end,
+    // comp);
+    RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle,
+                                begin + i_end, comp);
   }
 }
 
@@ -114,20 +118,27 @@ inline void sort_parallel_region(Sorter sorter,
   }
 
   // hierarchically merge ranges
-  for (diff_type middle_offset = 1; middle_offset < num_threads; middle_offset *= 2) {
+  for (diff_type middle_offset = 1; middle_offset < num_threads;
+       middle_offset *= 2)
+  {
 
-    diff_type end_offset = 2*middle_offset;
+    diff_type end_offset = 2 * middle_offset;
 
-    const diff_type i_middle = firstIndex(n, num_threads, std::min(thread_id + middle_offset, num_threads));
-    const diff_type i_end    = firstIndex(n, num_threads, std::min(thread_id + end_offset,    num_threads));
+    const diff_type i_middle = firstIndex(
+        n, num_threads, std::min(thread_id + middle_offset, num_threads));
+    const diff_type i_end = firstIndex(
+        n, num_threads, std::min(thread_id + end_offset, num_threads));
 
 #pragma omp barrier
 
-    if (thread_id % end_offset == 0) {
+    if (thread_id % end_offset == 0)
+    {
 
       // this thread merges ranges [i_begin, i_middle) and [i_middle, i_end)
-      //std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
-      RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
+      // std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end,
+      // comp);
+      RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle,
+                                  begin + i_end, comp);
     }
   }
 }
@@ -139,11 +150,7 @@ inline void sort_parallel_region(Sorter sorter,
         \brief sort given range using sorter and comparison function
 */
 template <typename Sorter, typename Iter, typename Compare>
-inline
-void sort(Sorter sorter,
-          Iter begin,
-          Iter end,
-          Compare comp)
+inline void sort(Sorter sorter, Iter begin, Iter end, Compare comp)
 {
   using diff_type = RAJA::detail::IterDiff<Iter>;
 
@@ -151,20 +158,24 @@ void sort(Sorter sorter,
 
   const diff_type n = end - begin;
 
-  if (n <= min_iterates_per_task) {
+  if (n <= min_iterates_per_task)
+  {
 
     sorter(begin, end, comp);
-
-  } else {
+  }
+  else
+  {
 
     const diff_type max_threads = omp_get_max_threads();
 
 #if defined(RAJA_ENABLE_OPENMP_TASK_INTERNAL)
 
-    const diff_type iterates_per_task = std::max(n/(2*max_threads), min_iterates_per_task);
+    const diff_type iterates_per_task =
+        std::max(n / (2 * max_threads), min_iterates_per_task);
 
-    const diff_type requested_num_threads = std::min((n+iterates_per_task-1)/iterates_per_task, max_threads);
-    RAJA_UNUSED_VAR(requested_num_threads); // avoid warning in hip device code
+    const diff_type requested_num_threads =
+        std::min((n + iterates_per_task - 1) / iterates_per_task, max_threads);
+    RAJA_UNUSED_VAR(requested_num_threads);  // avoid warning in hip device code
 
 #pragma omp parallel num_threads(static_cast<int>(requested_num_threads))
 #pragma omp master
@@ -174,8 +185,9 @@ void sort(Sorter sorter,
 
 #else
 
-    const diff_type requested_num_threads = std::min((n+min_iterates_per_task-1)/min_iterates_per_task, max_threads);
-    RAJA_UNUSED_VAR(requested_num_threads); // avoid warning in hip device code
+    const diff_type requested_num_threads = std::min(
+        (n + min_iterates_per_task - 1) / min_iterates_per_task, max_threads);
+    RAJA_UNUSED_VAR(requested_num_threads);  // avoid warning in hip device code
 
 #pragma omp parallel num_threads(static_cast<int>(requested_num_threads))
     {
@@ -186,9 +198,9 @@ void sort(Sorter sorter,
   }
 }
 
-} // namespace openmp
+}  // namespace openmp
 
-} // namespace detail
+}  // namespace detail
 
 /*!
         \brief sort given range using comparison function
@@ -196,14 +208,13 @@ void sort(Sorter sorter,
 template <typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-unstable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+unstable(resources::Host host_res,
+         const ExecPolicy&,
+         Iter begin,
+         Iter end,
+         Compare comp)
 {
-  detail::openmp::sort(detail::UnstableSorter{}, begin, end, comp);
+  detail::openmp::sort(detail::UnstableSorter {}, begin, end, comp);
 
   return resources::EventProxy<resources::Host>(host_res);
 }
@@ -214,14 +225,13 @@ unstable(
 template <typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-stable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+stable(resources::Host host_res,
+       const ExecPolicy&,
+       Iter begin,
+       Iter end,
+       Compare comp)
 {
-  detail::openmp::sort(detail::StableSorter{}, begin, end, comp);
+  detail::openmp::sort(detail::StableSorter {}, begin, end, comp);
 
   return resources::EventProxy<resources::Host>(host_res);
 }
@@ -229,43 +239,50 @@ stable(
 /*!
         \brief sort given range of pairs using comparison function on keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template <typename ExecPolicy,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-unstable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+unstable_pairs(resources::Host host_res,
+               const ExecPolicy&,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               Compare comp)
 {
-  auto begin  = RAJA::zip(keys_begin, vals_begin);
-  auto end    = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin    = RAJA::zip(keys_begin, vals_begin);
+  auto end      = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::openmp::sort(detail::UnstableSorter{}, begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::openmp::sort(detail::UnstableSorter {}, begin, end,
+                       RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
 
 /*!
-        \brief stable sort given range of pairs using comparison function on keys
+        \brief stable sort given range of pairs using comparison function on
+   keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template <typename ExecPolicy,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-stable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+stable_pairs(resources::Host host_res,
+             const ExecPolicy&,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             Compare comp)
 {
-  auto begin  = RAJA::zip(keys_begin, vals_begin);
-  auto end    = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin    = RAJA::zip(keys_begin, vals_begin);
+  auto end      = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::openmp::sort(detail::StableSorter{}, begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::openmp::sort(detail::StableSorter {}, begin, end,
+                       RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
diff --git a/include/RAJA/policy/openmp_target.hpp b/include/RAJA/policy/openmp_target.hpp
index af88127636..018b3878d8 100644
--- a/include/RAJA/policy/openmp_target.hpp
+++ b/include/RAJA/policy/openmp_target.hpp
@@ -30,10 +30,11 @@
 #include "RAJA/policy/openmp_target/kernel.hpp"
 #include "RAJA/policy/openmp_target/forall.hpp"
 #include "RAJA/policy/openmp_target/reduce.hpp"
-//#include "RAJA/policy/openmp_target/multi_reduce.hpp"
+// #include "RAJA/policy/openmp_target/multi_reduce.hpp"
 #include "RAJA/policy/openmp_target/WorkGroup.hpp"
 
 
-#endif  // closing endif for if defined(RAJA_ENABLE_OPENMP) && defined(RAJA_ENABLE_TARGET_OPENMP)
+#endif  // closing endif for if defined(RAJA_ENABLE_OPENMP) &&
+        // defined(RAJA_ENABLE_TARGET_OPENMP)
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
index a4a4a62903..6ace7460fd 100644
--- a/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
@@ -36,12 +36,12 @@ namespace omp_target
 
 // create the value in a target region using the factory, map the value
 // back, and return the value created in the target region
-template < typename Factory >
+template <typename Factory>
 inline auto get_value(Factory factory)
 {
   typename std::decay_t<Factory>::value_type value;
 
-  #pragma omp target map(tofrom : value) map(to : factory)
+#pragma omp target map(tofrom : value) map(to : factory)
   {
     value = factory();
   }
@@ -51,7 +51,7 @@ inline auto get_value(Factory factory)
 
 // get the device value and store it so it can be used
 // multiple times
-template < typename Factory >
+template <typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -61,17 +61,18 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace omp_target
 
 /*!
-* Populate and return a Dispatcher object that can be used in omp target regions
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate and return a Dispatcher object that can be used in omp target
+ * regions
+ */
+template <typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(omp_target_work const&)
 {
-  static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return omp_target::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+  static Dispatcher_T dispatcher {Dispatcher_T::template makeDispatcher<T>(
+      [](auto&& factory)
+      {
+        return omp_target::get_cached_value(
+            std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
index b373d09c61..96c2323c33 100644
--- a/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
@@ -38,23 +38,21 @@ namespace detail
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_target_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::omp_target_parallel_for_exec_nt,
-        RAJA::omp_target_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::omp_target_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::omp_target_parallel_for_exec_nt,
+                              RAJA::omp_target_work,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
+{};
 
 /*!
  * Runs work in a storage container in reverse order
@@ -63,23 +61,21 @@ struct WorkRunner<
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_target_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::omp_target_parallel_for_exec_nt,
-        RAJA::omp_target_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::omp_target_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::omp_target_parallel_for_exec_nt,
+                              RAJA::omp_target_work,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
+{};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp
index 061481cbc1..c61a7d09f5 100644
--- a/include/RAJA/policy/openmp_target/forall.hpp
+++ b/include/RAJA/policy/openmp_target/forall.hpp
@@ -33,13 +33,15 @@ namespace omp
 /// OpenMP target parallel for policy implementation
 ///
 
-template <size_t ThreadsPerTeam, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+template <size_t ThreadsPerTeam,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec<ThreadsPerTeam>& p,
             Iterable&& iter,
@@ -51,33 +53,35 @@ forall_impl(resources::Omp omp_res,
   RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
+  Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
   // Reset if exceed CUDA threads per block limit.
   int tperteam = ThreadsPerTeam;
-  if ( tperteam > omp::MAXNUMTHREADS )
+  if (tperteam > omp::MAXNUMTHREADS)
   {
     tperteam = omp::MAXNUMTHREADS;
   }
 
   // calculate number of teams based on user defined threads per team
   // datasize is distance between begin() and end() of iterable
-  auto numteams = RAJA_DIVIDE_CEILING_INT( distance_it, tperteam );
-  if ( numteams > tperteam )
+  auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam);
+  if (numteams > tperteam)
   {
     // Omp target reducers will write team # results, into Threads-sized array.
     // Need to insure NumTeams <= Threads to prevent array out of bounds access.
     numteams = tperteam;
   }
 
-// thread_limit(tperteam) unused due to XL seg fault (when tperteam != distance)
+  // thread_limit(tperteam) unused due to XL seg fault (when tperteam !=
+  // distance)
   auto i = distance_it;
 
-#pragma omp target teams distribute parallel for num_teams(numteams) \
-    schedule(static, 1) map(to : body,begin_it) reduction(combine: f_params)
-  for (i = 0; i < distance_it; ++i) {
+#pragma omp target teams distribute parallel for num_teams(numteams)           \
+    schedule(static, 1) map(to : body, begin_it) reduction(combine : f_params)
+  for (i = 0; i < distance_it; ++i)
+  {
     Body ib = body;
     RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
   }
@@ -86,13 +90,14 @@ forall_impl(resources::Omp omp_res,
   return resources::EventProxy<resources::Omp>(omp_res);
 }
 
-template <size_t ThreadsPerTeam, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+template <size_t ThreadsPerTeam,
+          typename Iterable,
+          typename Func,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec<ThreadsPerTeam>&,
             Iterable&& iter,
@@ -100,33 +105,35 @@ forall_impl(resources::Omp omp_res,
             ForallParam)
 {
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
+  Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
   // Reset if exceed CUDA threads per block limit.
   int tperteam = ThreadsPerTeam;
-  if ( tperteam > omp::MAXNUMTHREADS )
+  if (tperteam > omp::MAXNUMTHREADS)
   {
     tperteam = omp::MAXNUMTHREADS;
   }
 
   // calculate number of teams based on user defined threads per team
   // datasize is distance between begin() and end() of iterable
-  auto numteams = RAJA_DIVIDE_CEILING_INT( distance_it, tperteam );
-  if ( numteams > tperteam )
+  auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam);
+  if (numteams > tperteam)
   {
     // Omp target reducers will write team # results, into Threads-sized array.
     // Need to insure NumTeams <= Threads to prevent array out of bounds access.
     numteams = tperteam;
   }
 
-// thread_limit(tperteam) unused due to XL seg fault (when tperteam != distance)
+  // thread_limit(tperteam) unused due to XL seg fault (when tperteam !=
+  // distance)
   auto i = distance_it;
 
-#pragma omp target teams distribute parallel for num_teams(numteams) \
-    schedule(static, 1) map(to : body,begin_it)
-  for (i = 0; i < distance_it; ++i) {
+#pragma omp target teams distribute parallel for num_teams(numteams)           \
+    schedule(static, 1) map(to : body, begin_it)
+  for (i = 0; i < distance_it; ++i)
+  {
     Body ib = body;
     ib(begin_it[i]);
   }
@@ -135,16 +142,12 @@ forall_impl(resources::Omp omp_res,
 }
 
 
-
-
-
 template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec_nt& p,
             Iterable&& iter,
@@ -156,13 +159,14 @@ forall_impl(resources::Omp omp_res,
   RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
+  Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
-#pragma omp target teams distribute parallel for schedule(static, 1) \
-    firstprivate(body,begin_it) reduction(combine: f_params)
-  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
+    firstprivate(body, begin_it) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
     Body ib = body;
     RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
   }
@@ -172,12 +176,10 @@ forall_impl(resources::Omp omp_res,
 }
 
 template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec_nt&,
             Iterable&& iter,
@@ -185,13 +187,14 @@ forall_impl(resources::Omp omp_res,
             ForallParam)
 {
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
+  Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
-#pragma omp target teams distribute parallel for schedule(static, 1) \
-    firstprivate(body,begin_it)
-  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
+    firstprivate(body, begin_it)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
     Body ib = body;
     ib(begin_it[i]);
   }
diff --git a/include/RAJA/policy/openmp_target/kernel/Collapse.hpp b/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
index b72147151c..22d2eb32d8 100644
--- a/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
+++ b/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
@@ -10,13 +10,19 @@
 
 #include "RAJA/pattern/kernel/internal.hpp"
 
-namespace RAJA {
-namespace internal {
+namespace RAJA
+{
+namespace internal
+{
 
-template <camp::idx_t Arg0, camp::idx_t Arg1, typename... EnclosedStmts, typename Types>
+template <camp::idx_t Arg0,
+          camp::idx_t Arg1,
+          typename... EnclosedStmts,
+          typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1>,
-                                             EnclosedStmts...>, Types>
+                                             EnclosedStmts...>,
+                         Types>
 {
   template <typename Data>
   static RAJA_INLINE void exec(Data&& data)
@@ -30,17 +36,20 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp target teams distribute parallel for schedule(static, 1) \
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(privatizer) collapse(2)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          auto& private_data = privatizer.get_priv();
-          private_data.template assign_offset<Arg0>(i0);
-          private_data.template assign_offset<Arg1>(i1);
-          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(private_data);
-        }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0)
+    {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1)
+      {
+        auto& private_data = privatizer.get_priv();
+        private_data.template assign_offset<Arg0>(i0);
+        private_data.template assign_offset<Arg1>(i1);
+        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(
+            private_data);
       }
     }
+  }
 };
 
 template <camp::idx_t Arg0,
@@ -50,7 +59,8 @@ template <camp::idx_t Arg0,
           typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2>,
-                                             EnclosedStmts...>, Types>
+                                             EnclosedStmts...>,
+                         Types>
 {
   template <typename Data>
   static RAJA_INLINE void exec(Data&& data)
@@ -66,20 +76,24 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp target teams distribute parallel for schedule(static, 1) \
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(privatizer) collapse(3)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          for (auto i2 = (decltype(l2))0; i2 < l2; ++i2) {
-            auto& private_data = privatizer.get_priv();
-            private_data.template assign_offset<Arg0>(i0);
-            private_data.template assign_offset<Arg1>(i1);
-            private_data.template assign_offset<Arg2>(i2);
-            execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(private_data);
-          }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0)
+    {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1)
+      {
+        for (auto i2 = (decltype(l2))0; i2 < l2; ++i2)
+        {
+          auto& private_data = privatizer.get_priv();
+          private_data.template assign_offset<Arg0>(i0);
+          private_data.template assign_offset<Arg1>(i1);
+          private_data.template assign_offset<Arg2>(i2);
+          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(
+              private_data);
         }
       }
     }
+  }
 };
 
 template <camp::idx_t Arg0,
@@ -90,7 +104,8 @@ template <camp::idx_t Arg0,
           typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2, Arg3>,
-                                             EnclosedStmts...>, Types>
+                                             EnclosedStmts...>,
+                         Types>
 {
   template <typename Data>
   static RAJA_INLINE void exec(Data&& data)
@@ -108,26 +123,31 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp target teams distribute parallel for schedule(static, 1) \
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(privatizer) collapse(4)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          for (auto i2 = (decltype(l2))0; i2 < l2; ++i2) {
-            for (auto i3 = (decltype(l3))0; i3 < l3; ++i3) {
-              auto& private_data = privatizer.get_priv();
-              private_data.template assign_offset<Arg0>(i0);
-              private_data.template assign_offset<Arg1>(i1);
-              private_data.template assign_offset<Arg2>(i2);
-              private_data.template assign_offset<Arg3>(i2);
-              execute_statement_list<camp::list<EnclosedStmts...>, NewTypes3>(private_data);
-            }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0)
+    {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1)
+      {
+        for (auto i2 = (decltype(l2))0; i2 < l2; ++i2)
+        {
+          for (auto i3 = (decltype(l3))0; i3 < l3; ++i3)
+          {
+            auto& private_data = privatizer.get_priv();
+            private_data.template assign_offset<Arg0>(i0);
+            private_data.template assign_offset<Arg1>(i1);
+            private_data.template assign_offset<Arg2>(i2);
+            private_data.template assign_offset<Arg3>(i3);  // innermost index
+            execute_statement_list<camp::list<EnclosedStmts...>, NewTypes3>(
+                private_data);
           }
         }
       }
     }
+  }
 };
 
-}
-}
+}  // namespace internal
+}  // namespace RAJA
 
-#endif // RAJA_policy_openmp_target_kernel_Collapse_HPP
+#endif  // RAJA_policy_openmp_target_kernel_Collapse_HPP
diff --git a/include/RAJA/policy/openmp_target/kernel/For.hpp b/include/RAJA/policy/openmp_target/kernel/For.hpp
index 173230b9e2..38e48c4d24 100644
--- a/include/RAJA/policy/openmp_target/kernel/For.hpp
+++ b/include/RAJA/policy/openmp_target/kernel/For.hpp
@@ -10,25 +10,32 @@
 
 #include "RAJA/pattern/kernel/internal.hpp"
 
-namespace RAJA {
-namespace internal {
+namespace RAJA
+{
+namespace internal
+{
 
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
-struct OpenMPTargetForWrapper : public GenericWrapperBase 
+template <camp::idx_t ArgumentId,
+          typename Data,
+          typename Types,
+          typename... EnclosedStmts>
+struct OpenMPTargetForWrapper : public GenericWrapperBase
 {
   using data_t = camp::decay<Data>;
 
   data_t data;
 
-  /*! 
+  /*!
    * \brief Deferences data so that it can be mapped to the device
    */
   RAJA_INLINE
-  constexpr explicit OpenMPTargetForWrapper(data_t &d) : 
-    data{d}  {}
+  constexpr explicit OpenMPTargetForWrapper(data_t& d) : data {d} {}
 
   RAJA_INLINE
-  void exec() { execute_statement_list<camp::list<EnclosedStmts...>, Types>(data); }
+  void exec()
+  {
+    execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
+  }
 
   template <typename InIndexType>
   RAJA_INLINE void operator()(InIndexType i)
@@ -42,28 +49,33 @@ template <camp::idx_t ArgumentId,
           int N,
           typename... EnclosedStmts,
           typename Types>
-struct StatementExecutor<statement::For<ArgumentId, omp_target_parallel_for_exec<N>, EnclosedStmts...>, Types>
+struct StatementExecutor<statement::For<ArgumentId,
+                                        omp_target_parallel_for_exec<N>,
+                                        EnclosedStmts...>,
+                         Types>
 {
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
-    OpenMPTargetForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...> for_wrapper(data);
+    OpenMPTargetForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...>
+        for_wrapper(data);
 
-    auto len = segment_length<ArgumentId>(data);
+    auto len    = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     auto r = resources::Omp::get_default();
-    forall_impl(r, omp_target_parallel_for_exec<N>{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, omp_target_parallel_for_exec<N> {},
+                TypedRangeSegment<len_t>(0, len), for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
 
+}  // namespace internal
+}  // namespace RAJA
 
-}
-}
-
-#endif // RAJA_policy_openmp_kernel_For_HPP
+#endif  // RAJA_policy_openmp_kernel_For_HPP
diff --git a/include/RAJA/policy/openmp_target/params/kernel_name.hpp b/include/RAJA/policy/openmp_target/params/kernel_name.hpp
index 5e9edb4b6c..3579269bdf 100644
--- a/include/RAJA/policy/openmp_target/params/kernel_name.hpp
+++ b/include/RAJA/policy/openmp_target/params/kernel_name.hpp
@@ -3,38 +3,42 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  combine(KernelName&, T& /*place holder argument*/) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+init(KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template <typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+combine(KernelName&, T& /*place holder argument*/)
+{}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+resolve(KernelName&)
+{
+  // TODO: Define kernel naming
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp_target/params/reduce.hpp b/include/RAJA/policy/openmp_target/params/reduce.hpp
index 34c23fb5db..0364470945 100644
--- a/include/RAJA/policy/openmp_target/params/reduce.hpp
+++ b/include/RAJA/policy/openmp_target/params/reduce.hpp
@@ -3,37 +3,43 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+init(Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+resolve(Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_OMP_REDUCE_HPP
+#endif  //  NEW_REDUCE_OMP_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp_target/policy.hpp b/include/RAJA/policy/openmp_target/policy.hpp
index 520f5afc55..4e0b05a00c 100644
--- a/include/RAJA/policy/openmp_target/policy.hpp
+++ b/include/RAJA/policy/openmp_target/policy.hpp
@@ -10,10 +10,13 @@
 
 #include "RAJA/policy/PolicyBase.hpp"
 
-namespace RAJA {
+namespace RAJA
+{
 
-namespace policy {
-namespace omp {
+namespace policy
+{
+namespace omp
+{
 
 // Max number of CUDA reduction threads per block possible.
 // Required for allocating omp target data before execution policy.
@@ -21,47 +24,48 @@ namespace omp {
 static constexpr int MAXNUMTHREADS = 1024;
 
 template <unsigned int TeamSize>
-struct Teams : std::integral_constant<unsigned int, TeamSize> {
-};
+struct Teams : std::integral_constant<unsigned int, TeamSize>
+{};
 
-struct Target {
-};
+struct Target
+{};
 
-struct Distribute {
-};
+struct Distribute
+{};
 
-struct Collapse {
-};
+struct Collapse
+{};
 
 template <size_t ThreadsPerTeam>
 struct omp_target_parallel_for_exec
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Teams<ThreadsPerTeam>,
-                            omp::Distribute> {
-};
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Teams<ThreadsPerTeam>,
+                                     omp::Distribute>
+{};
 
 struct omp_target_parallel_for_exec_nt
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Distribute> {
-};
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Distribute>
+{};
 
 struct omp_target_parallel_collapse_exec
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Collapse> {
-};
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Collapse>
+{};
 
-struct omp_target_reduce
-    : make_policy_pattern_platform_t<Policy::target_openmp, Pattern::reduce, Platform::omp_target> {
-};
+struct omp_target_reduce : make_policy_pattern_platform_t<Policy::target_openmp,
+                                                          Pattern::reduce,
+                                                          Platform::omp_target>
+{};
 
 ///
 /// WorkGroup execution policies
@@ -70,21 +74,21 @@ struct omp_target_work
     : make_policy_pattern_launch_platform_t<Policy::target_openmp,
                                             Pattern::workgroup_exec,
                                             Launch::sync,
-                                            Platform::omp_target> {
-};
+                                            Platform::omp_target>
+{};
 
 
-}  // closing brace for omp namespace
-}  // closing brace for policy namespace
+}  // namespace omp
+}  // namespace policy
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
+using policy::omp::omp_target_parallel_collapse_exec;
 using policy::omp::omp_target_parallel_for_exec;
 using policy::omp::omp_target_parallel_for_exec_nt;
 using policy::omp::omp_target_reduce;
-using policy::omp::omp_target_parallel_collapse_exec;
 using policy::omp::omp_target_work;
 #endif
 
-} // closing brace for RAJA namespace
+}  // namespace RAJA
 
-#endif // RAJA_policy_openmp_target_HPP
+#endif  // RAJA_policy_openmp_target_HPP
diff --git a/include/RAJA/policy/openmp_target/reduce.hpp b/include/RAJA/policy/openmp_target/reduce.hpp
index 6691729bbe..0470c52136 100644
--- a/include/RAJA/policy/openmp_target/reduce.hpp
+++ b/include/RAJA/policy/openmp_target/reduce.hpp
@@ -12,7 +12,7 @@
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
-//#include <cassert>  // Leaving out until XL is fixed 2/25/2019.
+// #include <cassert>  // Leaving out until XL is fixed 2/25/2019.
 
 #include <algorithm>
 
@@ -33,15 +33,14 @@ namespace omp
 #pragma omp declare target
 
 template <typename T, typename I>
-struct minloc 
+struct minloc
 {
   static constexpr T identity = T(::RAJA::operators::limits<T>::max());
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
-                                               I &loc,
-                                               const T v,
-                                               const I l)
+  RAJA_HOST_DEVICE RAJA_INLINE void
+  operator()(T& val, I& loc, const T v, const I l)
   {
-    if (v < val) {
+    if (v < val)
+    {
       loc = l;
       val = v;
     }
@@ -49,15 +48,14 @@ struct minloc
 };
 
 template <typename T, typename I>
-struct maxloc 
+struct maxloc
 {
   static constexpr T identity = T(::RAJA::operators::limits<T>::min());
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
-                                               I &loc,
-                                               const T v,
-                                               const I l)
+  RAJA_HOST_DEVICE RAJA_INLINE void
+  operator()(T& val, I& loc, const T v, const I l)
   {
-    if (v > val) {
+    if (v > val)
+    {
       loc = l;
       val = v;
     }
@@ -70,18 +68,19 @@ struct maxloc
 static constexpr int MaxNumTeams = policy::omp::MAXNUMTHREADS;
 
 //! Information necessary for OpenMP offload to be considered
-struct Offload_Info 
+struct Offload_Info
 {
-  int hostID{omp_get_initial_device()};
-  int deviceID{omp_get_default_device()};
-  bool isMapped{false};
+  int hostID {omp_get_initial_device()};
+  int deviceID {omp_get_default_device()};
+  bool isMapped {false};
 
   Offload_Info() = default;
 
-  Offload_Info(const Offload_Info &other)
-      : hostID{other.hostID}, deviceID{other.deviceID}, isMapped{other.isMapped}
-  {
-  }
+  Offload_Info(const Offload_Info& other)
+      : hostID {other.hostID},
+        deviceID {other.deviceID},
+        isMapped {other.isMapped}
+  {}
 };
 
 //! Reduction data for OpenMP Offload -- stores value, host pointer, and device
@@ -90,8 +89,8 @@ template <typename T>
 struct Reduce_Data
 {
   mutable T value;
-  T *device;
-  T *host;
+  T* device;
+  T* host;
 
   //! disallow default constructor
   Reduce_Data() = delete;
@@ -100,17 +99,19 @@ struct Reduce_Data
    *
    *  allocates data on the host and device and initializes values to default
    */
-  Reduce_Data(T initValue, T identityValue, Offload_Info &info)
-     : value(initValue),
-        device{reinterpret_cast<T *>(
+  Reduce_Data(T initValue, T identityValue, Offload_Info& info)
+      : value(initValue),
+        device {reinterpret_cast<T*>(
             omp_target_alloc(omp::MaxNumTeams * sizeof(T), info.deviceID))},
-        host{new T[omp::MaxNumTeams]}
+        host {new T[omp::MaxNumTeams]}
   {
-    if (!host) {
+    if (!host)
+    {
       printf("Unable to allocate space on host\n");
       exit(1);
     }
-    if (!device) {
+    if (!device)
+    {
       printf("Unable to allocate space on device\n");
       exit(1);
     }
@@ -118,55 +119,50 @@ struct Reduce_Data
     hostToDevice(info);
   }
 
-  void reset(T initValue)
-  {
-    value = initValue;
-  }
+  void reset(T initValue) { value = initValue; }
 
 
   //! default copy constructor for POD
-  Reduce_Data(const Reduce_Data &) = default;
+  Reduce_Data(const Reduce_Data&) = default;
 
   //! transfers from the host to the device -- exit() is called upon failure
-  RAJA_INLINE void hostToDevice(Offload_Info &info)
+  RAJA_INLINE void hostToDevice(Offload_Info& info)
   {
     // precondition: host and device are valid pointers
-    if (omp_target_memcpy(reinterpret_cast<void *>(device),
-                          reinterpret_cast<void *>(host),
-                          omp::MaxNumTeams * sizeof(T),
-                          0,
-                          0,
-                          info.deviceID,
-                          info.hostID) != 0) {
+    if (omp_target_memcpy(reinterpret_cast<void*>(device),
+                          reinterpret_cast<void*>(host),
+                          omp::MaxNumTeams * sizeof(T), 0, 0, info.deviceID,
+                          info.hostID) != 0)
+    {
       printf("Unable to copy memory from host to device\n");
       exit(1);
     }
   }
 
   //! transfers from the device to the host -- exit() is called upon failure
-  RAJA_INLINE void deviceToHost(Offload_Info &info)
+  RAJA_INLINE void deviceToHost(Offload_Info& info)
   {
     // precondition: host and device are valid pointers
-    if (omp_target_memcpy(reinterpret_cast<void *>(host),
-                          reinterpret_cast<void *>(device),
-                          omp::MaxNumTeams * sizeof(T),
-                          0,
-                          0,
-                          info.hostID,
-                          info.deviceID) != 0) {
+    if (omp_target_memcpy(reinterpret_cast<void*>(host),
+                          reinterpret_cast<void*>(device),
+                          omp::MaxNumTeams * sizeof(T), 0, 0, info.hostID,
+                          info.deviceID) != 0)
+    {
       printf("Unable to copy memory from device to host\n");
       exit(1);
     }
   }
 
   //! frees all data from the offload information passed
-  RAJA_INLINE void cleanup(Offload_Info &info)
+  RAJA_INLINE void cleanup(Offload_Info& info)
   {
-    if (device) {
-      omp_target_free(reinterpret_cast<void *>(device), info.deviceID);
+    if (device)
+    {
+      omp_target_free(reinterpret_cast<void*>(device), info.deviceID);
       device = nullptr;
     }
-    if (host) {
+    if (host)
+    {
       delete[] host;
       host = nullptr;
     }
@@ -178,77 +174,80 @@ struct Reduce_Data
 //! OpenMP Target Reduction entity -- generalize on # of teams, reduction, and
 //! type
 template <typename Reducer, typename T>
-struct TargetReduce 
+struct TargetReduce
 {
-  TargetReduce() = delete;
-  TargetReduce(const TargetReduce &) = default;
+  TargetReduce()                    = delete;
+  TargetReduce(const TargetReduce&) = default;
 
   explicit TargetReduce(T init_val_, T identity_ = Reducer::identity())
       : info(),
         val(identity_, identity_, info),
         initVal(init_val_),
         finalVal(identity_)
-  {
-  }
+  {}
 
   void reset(T init_val_, T identity_ = Reducer::identity())
   {
     operator T();
     val.reset(identity_);
-    initVal = init_val_;
+    initVal  = init_val_;
     finalVal = identity_;
   }
 
-#ifdef __ibmxl__ // TODO: implicit declare target doesn't pick this up
+#ifdef __ibmxl__  // TODO: implicit declare target doesn't pick this up
 #pragma omp declare target
 #endif
   //! apply reduction on device upon destruction
   ~TargetReduce()
   {
-    //assert ( omp_get_num_teams() <= omp::MaxNumTeams );  // Leaving out until XL is fixed 2/25/2019.
-    if (!omp_is_initial_device()) {
+    // Leaving out until XL is fixed 2/25/2019:
+    // assert ( omp_get_num_teams() <= omp::MaxNumTeams );
+    if (!omp_is_initial_device())
+    {
 #pragma omp critical
       {
         int tid = omp_get_team_num();
-        Reducer{}(val.device[tid], val.value);
+        Reducer {}(val.device[tid], val.value);
       }
     }
   }
-#ifdef __ibmxl__ // TODO: implicit declare target doesn't pick this up
+#ifdef __ibmxl__  // TODO: implicit declare target doesn't pick this up
 #pragma omp end declare target
 #endif
 
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    if (!info.isMapped) {
+    if (!info.isMapped)
+    {
       val.deviceToHost(info);
 
-      for (int i = 0; i < omp::MaxNumTeams; ++i) {
-        Reducer{}(val.value, val.host[i]);
+      for (int i = 0; i < omp::MaxNumTeams; ++i)
+      {
+        Reducer {}(val.value, val.host[i]);
       }
       val.cleanup(info);
       info.isMapped = true;
     }
     finalVal = Reducer::identity();
-    Reducer{}(finalVal, initVal);
-    Reducer{}(finalVal, val.value);
+    Reducer {}(finalVal, initVal);
+    Reducer {}(finalVal, val.value);
     return finalVal;
   }
   //! alias for operator T()
   T get() { return operator T(); }
 
   //! apply reduction
-  TargetReduce &reduce(T rhsVal)
+  TargetReduce& reduce(T rhsVal)
   {
-    Reducer{}(val.value, rhsVal);
+    Reducer {}(val.value, rhsVal);
     return *this;
   }
 
   //! apply reduction (const version) -- still reduces internal values
-  const TargetReduce &reduce(T rhsVal) const
+  const TargetReduce& reduce(T rhsVal) const
   {
-    Reducer{}(val.value, rhsVal);
+    Reducer {}(val.value, rhsVal);
     return *this;
   }
 
@@ -264,13 +263,16 @@ struct TargetReduce
 //! OpenMP Target Reduction Location entity -- generalize on # of teams,
 //! reduction, and type
 template <typename Reducer, typename T, typename IndexType>
-struct TargetReduceLoc 
+struct TargetReduceLoc
 {
-  TargetReduceLoc() = delete;
-  TargetReduceLoc(const TargetReduceLoc &) = default;
-  explicit TargetReduceLoc(T init_val_, IndexType init_loc,
-                           T identity_val_ = Reducer::identity,
-                           IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+  TargetReduceLoc()                       = delete;
+  TargetReduceLoc(const TargetReduceLoc&) = default;
+  explicit TargetReduceLoc(
+      T init_val_,
+      IndexType init_loc,
+      T identity_val_ = Reducer::identity,
+      IndexType identity_loc_ =
+          RAJA::reduce::detail::DefaultLoc<IndexType>().value())
       : info(),
         val(identity_val_, identity_val_, info),
         loc(identity_loc_, identity_loc_, info),
@@ -278,31 +280,34 @@ struct TargetReduceLoc
         finalVal(identity_val_),
         initLoc(init_loc),
         finalLoc(identity_loc_)
-  {
-  }
+  {}
 
-  void reset(T init_val_, IndexType init_loc_,
+  void reset(T init_val_,
+             IndexType init_loc_,
              T identity_val_ = Reducer::identity,
-             IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_loc_ =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
     operator T();
     val.reset(identity_val_);
     loc.reset(identity_loc_);
-    initVal = init_val_;
+    initVal  = init_val_;
     finalVal = identity_val_;
-    initLoc = init_loc_;
+    initLoc  = init_loc_;
     finalLoc = identity_loc_;
   }
 
   //! apply reduction on device upon destruction
   ~TargetReduceLoc()
   {
-    //assert ( omp_get_num_teams() <= omp::MaxNumTeams );  // Leaving out until XL is fixed 2/25/2019.
-    if (!omp_is_initial_device()) {
+    // Leaving out until XL is fixed 2/25/2019:
+    // assert ( omp_get_num_teams() <= omp::MaxNumTeams );
+    if (!omp_is_initial_device())
+    {
 #pragma omp critical
       {
         int tid = omp_get_team_num();
-        Reducer{}(val.device[tid], loc.device[tid], val.value, loc.value);
+        Reducer {}(val.device[tid], loc.device[tid], val.value, loc.value);
       }
     }
   }
@@ -310,11 +315,13 @@ struct TargetReduceLoc
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    if (!info.isMapped) {
+    if (!info.isMapped)
+    {
       val.deviceToHost(info);
       loc.deviceToHost(info);
-      for (int i = 0; i < omp::MaxNumTeams; ++i) {
-        Reducer{}(val.value, loc.value, val.host[i], loc.host[i]);
+      for (int i = 0; i < omp::MaxNumTeams; ++i)
+      {
+        Reducer {}(val.value, loc.value, val.host[i], loc.host[i]);
       }
       val.cleanup(info);
       loc.cleanup(info);
@@ -322,8 +329,8 @@ struct TargetReduceLoc
     }
     finalVal = Reducer::identity;
     finalLoc = IndexType(RAJA::reduce::detail::DefaultLoc<IndexType>().value());
-    Reducer{}(finalVal, finalLoc, initVal, initLoc);
-    Reducer{}(finalVal, finalLoc, val.value, loc.value);
+    Reducer {}(finalVal, finalLoc, initVal, initLoc);
+    Reducer {}(finalVal, finalLoc, val.value, loc.value);
     return finalVal;
   }
   //! alias for operator T()
@@ -339,16 +346,16 @@ struct TargetReduceLoc
   }
 
   //! apply reduction
-  TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc)
+  TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc)
   {
-    Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
+    Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
     return *this;
   }
 
   //! apply reduction (const version) -- still reduces internal values
-  const TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc) const
+  const TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc) const
   {
-    Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
+    Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
     return *this;
   }
 
@@ -372,20 +379,19 @@ class ReduceSum<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::sum<T>, T>
 {
 public:
-
-  using self = ReduceSum<omp_target_reduce, T>;
+  using self   = ReduceSum<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::sum<T>, T>;
   using parent::parent;
 
   //! enable operator+= for ReduceSum -- alias for reduce()
-  self &operator+=(T rhsVal)
+  self& operator+=(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable operator+= for ReduceSum -- alias for reduce()
-  const self &operator+=(T rhsVal) const
+  const self& operator+=(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -398,20 +404,19 @@ class ReduceBitOr<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::or_bit<T>, T>
 {
 public:
-
-  using self = ReduceBitOr<omp_target_reduce, T>;
+  using self   = ReduceBitOr<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::or_bit<T>, T>;
   using parent::parent;
 
   //! enable operator|= for ReduceBitOr -- alias for reduce()
-  self &operator|=(T rhsVal)
+  self& operator|=(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable operator|= for ReduceBitOr -- alias for reduce()
-  const self &operator|=(T rhsVal) const
+  const self& operator|=(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -424,20 +429,19 @@ class ReduceBitAnd<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::and_bit<T>, T>
 {
 public:
-
-  using self = ReduceBitAnd<omp_target_reduce, T>;
+  using self   = ReduceBitAnd<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::and_bit<T>, T>;
   using parent::parent;
 
   //! enable operator&= for ReduceBitAnd -- alias for reduce()
-  self &operator&=(T rhsVal)
+  self& operator&=(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable operator&= for ReduceBitAnd -- alias for reduce()
-  const self &operator&=(T rhsVal) const
+  const self& operator&=(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -450,20 +454,19 @@ class ReduceMin<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::min<T>, T>
 {
 public:
-
-  using self = ReduceMin<omp_target_reduce, T>;
+  using self   = ReduceMin<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::min<T>, T>;
   using parent::parent;
 
   //! enable min() for ReduceMin -- alias for reduce()
-  self &min(T rhsVal)
+  self& min(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable min() for ReduceMin -- alias for reduce()
-  const self &min(T rhsVal) const
+  const self& min(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -477,20 +480,19 @@ class ReduceMax<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::max<T>, T>
 {
 public:
-
-  using self = ReduceMax<omp_target_reduce, T>;
+  using self   = ReduceMax<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::max<T>, T>;
   using parent::parent;
 
   //! enable max() for ReduceMax -- alias for reduce()
-  self &max(T rhsVal)
+  self& max(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable max() for ReduceMax -- alias for reduce()
-  const self &max(T rhsVal) const
+  const self& max(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -503,21 +505,19 @@ class ReduceMinLoc<omp_target_reduce, T, IndexType>
     : public TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>
 {
 public:
-
-  using self = ReduceMinLoc<omp_target_reduce, T, IndexType>;
-  using parent =
-      TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>;
+  using self   = ReduceMinLoc<omp_target_reduce, T, IndexType>;
+  using parent = TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>;
   using parent::parent;
 
   //! enable minloc() for ReduceMinLoc -- alias for reduce()
-  self &minloc(T rhsVal, IndexType rhsLoc)
+  self& minloc(T rhsVal, IndexType rhsLoc)
   {
     parent::reduce(rhsVal, rhsLoc);
     return *this;
   }
 
   //! enable minloc() for ReduceMinLoc -- alias for reduce()
-  const self &minloc(T rhsVal, IndexType rhsLoc) const
+  const self& minloc(T rhsVal, IndexType rhsLoc) const
   {
     parent::reduce(rhsVal, rhsLoc);
     return *this;
@@ -531,21 +531,19 @@ class ReduceMaxLoc<omp_target_reduce, T, IndexType>
     : public TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>
 {
 public:
-
-  using self = ReduceMaxLoc<omp_target_reduce, T, IndexType>;
-  using parent =
-      TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>;
+  using self   = ReduceMaxLoc<omp_target_reduce, T, IndexType>;
+  using parent = TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>;
   using parent::parent;
 
   //! enable maxloc() for ReduceMaxLoc -- alias for reduce()
-  self &maxloc(T rhsVal, IndexType rhsLoc)
+  self& maxloc(T rhsVal, IndexType rhsLoc)
   {
     parent::reduce(rhsVal, rhsLoc);
     return *this;
   }
 
   //! enable maxloc() for ReduceMaxLoc -- alias for reduce()
-  const self &maxloc(T rhsVal, IndexType rhsLoc) const
+  const self& maxloc(T rhsVal, IndexType rhsLoc) const
   {
     parent::reduce(rhsVal, rhsLoc);
     return *this;
diff --git a/include/RAJA/policy/sequential.hpp b/include/RAJA/policy/sequential.hpp
index 0963b31a01..90c6cb85ed 100644
--- a/include/RAJA/policy/sequential.hpp
+++ b/include/RAJA/policy/sequential.hpp
@@ -21,7 +21,7 @@
 #define RAJA_sequential_HPP
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/sequential/atomic.hpp"
+#include "RAJA/policy/sequential/atomic.hpp"
 #endif
 
 #include "RAJA/policy/sequential/forall.hpp"
diff --git a/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
index 13796fd8a3..ab97dbd3cf 100644
--- a/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
@@ -32,12 +32,12 @@ namespace detail
 {
 
 /*!
-* Populate and return a Dispatcher object
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate and return a Dispatcher object
+ */
+template <typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(seq_work const&)
 {
-  static Dispatcher_T dispatcher{ Dispatcher_T::template makeDispatcher<T>() };
+  static Dispatcher_T dispatcher {Dispatcher_T::template makeDispatcher<T>()};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
index 31e401bf88..b2b6f11bba 100644
--- a/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
@@ -38,23 +38,20 @@ namespace detail
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::seq_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::seq_exec,
-        RAJA::seq_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::seq_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...> : WorkRunnerForallOrdered<RAJA::seq_exec,
+                                                     RAJA::seq_work,
+                                                     RAJA::ordered,
+                                                     DISPATCH_POLICY_T,
+                                                     ALLOCATOR_T,
+                                                     INDEX_T,
+                                                     Args...>
+{};
 
 /*!
  * Runs work in a storage container in reverse order
@@ -63,23 +60,20 @@ struct WorkRunner<
 template <typename DISPATCH_POLICY_T,
           typename ALLOCATOR_T,
           typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::seq_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::seq_exec,
-        RAJA::seq_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+          typename... Args>
+struct WorkRunner<RAJA::seq_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...> : WorkRunnerForallReverse<RAJA::seq_exec,
+                                                     RAJA::seq_work,
+                                                     RAJA::reverse_ordered,
+                                                     DISPATCH_POLICY_T,
+                                                     ALLOCATOR_T,
+                                                     INDEX_T,
+                                                     Args...>
+{};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/sequential/atomic.hpp b/include/RAJA/policy/sequential/atomic.hpp
index 046e52e1c1..a9e5e4f256 100644
--- a/include/RAJA/policy/sequential/atomic.hpp
+++ b/include/RAJA/policy/sequential/atomic.hpp
@@ -27,24 +27,21 @@ namespace RAJA
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicLoad(seq_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(seq_atomic, T* acc)
 {
   return *acc;
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void atomicStore(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(seq_atomic, T* acc, T value)
 {
   *acc = value;
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc += value;
@@ -54,8 +51,7 @@ RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc -= value;
@@ -65,29 +61,26 @@ RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
-  *acc = ret < value ? ret : value;
+  *acc  = ret < value ? ret : value;
   return ret;
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
-  *acc = value < ret ? ret : value;
+  *acc  = value < ret ? ret : value;
   return ret;
 }
 
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(seq_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(seq_atomic, T* acc)
 {
   T ret = *acc;
   (*acc) += T(1);
@@ -96,18 +89,16 @@ RAJA_INLINE T atomicInc(seq_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(seq_atomic, T *acc, T val)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(seq_atomic, T* acc, T val)
 {
   T old = *acc;
-  *acc = val <= old ? T(0) : old + T(1);
+  *acc  = val <= old ? T(0) : old + T(1);
   return old;
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(seq_atomic, T *acc)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(seq_atomic, T* acc)
 {
   T ret = *acc;
   (*acc) -= T(1);
@@ -116,18 +107,16 @@ RAJA_INLINE T atomicDec(seq_atomic, T *acc)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(seq_atomic, T *acc, T val)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(seq_atomic, T* acc, T val)
 {
   T old = *acc;
-  *acc = old == T(0) || val < old ? val : old - T(1);
+  *acc  = old == T(0) || val < old ? val : old - T(1);
   return old;
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc &= value;
@@ -136,8 +125,7 @@ RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc |= value;
@@ -146,8 +134,7 @@ RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc ^= value;
@@ -156,21 +143,19 @@ RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value)
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(seq_atomic, T *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
-  *acc = value;
+  *acc  = value;
   return ret;
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(seq_atomic, T *acc, T compare, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(seq_atomic, T* acc, T compare, T value)
 {
   T ret = *acc;
-  *acc = ret == compare ? value : ret;
+  *acc  = ret == compare ? value : ret;
   return ret;
 }
 
diff --git a/include/RAJA/policy/sequential/forall.hpp b/include/RAJA/policy/sequential/forall.hpp
index 5d1d6d84b0..4bf9f1607a 100644
--- a/include/RAJA/policy/sequential/forall.hpp
+++ b/include/RAJA/policy/sequential/forall.hpp
@@ -55,24 +55,26 @@ namespace sequential
 //////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename Func, typename Resource, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<Resource>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+template <typename Iterable,
+          typename Func,
+          typename Resource,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Resource>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(Resource res,
-            const seq_exec &,
-            Iterable &&iter,
-            Func &&body,
+            const seq_exec&,
+            Iterable&& iter,
+            Func&& body,
             ForallParam f_params)
 {
   expt::ParamMultiplexer::init<seq_exec>(f_params);
 
   RAJA_EXTRACT_BED_IT(iter);
 
-  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
     expt::invoke_body(f_params, body, *(begin_it + i));
   }
 
@@ -80,22 +82,24 @@ forall_impl(Resource res,
   return resources::EventProxy<Resource>(res);
 }
 
-template <typename Iterable, typename Func, typename Resource, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<Resource>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+template <typename Iterable,
+          typename Func,
+          typename Resource,
+          typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Resource>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(Resource res,
-            const seq_exec &,
-            Iterable &&iter,
-            Func &&body,
+            const seq_exec&,
+            Iterable&& iter,
+            Func&& body,
             ForallParam)
 {
   RAJA_EXTRACT_BED_IT(iter);
 
-  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
     body(*(begin_it + i));
   }
   return resources::EventProxy<Resource>(res);
diff --git a/include/RAJA/policy/sequential/kernel/Collapse.hpp b/include/RAJA/policy/sequential/kernel/Collapse.hpp
index 8e600ec2e8..a722b89ff8 100644
--- a/include/RAJA/policy/sequential/kernel/Collapse.hpp
+++ b/include/RAJA/policy/sequential/kernel/Collapse.hpp
@@ -32,10 +32,12 @@ namespace internal
 //
 template <typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::Collapse<seq_exec, ArgList<>, EnclosedStmts...>, Types> {
+    statement::Collapse<seq_exec, ArgList<>, EnclosedStmts...>,
+    Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  static RAJA_INLINE void exec(Data& data)
   {
     // termination case: no more loops, just execute enclosed statements
     execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
@@ -47,13 +49,17 @@ struct StatementExecutor<
 // Executor that handles collapsing of an arbitrarily deep set of seq_exec
 // loops
 //
-template <camp::idx_t Arg0, camp::idx_t... ArgRest, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::Collapse<seq_exec,
-                                             ArgList<Arg0, ArgRest...>,
-                                             EnclosedStmts...>, Types> {
+template <camp::idx_t Arg0,
+          camp::idx_t... ArgRest,
+          typename... EnclosedStmts,
+          typename Types>
+struct StatementExecutor<
+    statement::Collapse<seq_exec, ArgList<Arg0, ArgRest...>, EnclosedStmts...>,
+    Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  static RAJA_INLINE void exec(Data& data)
   {
 
     // Set the argument type for this loop
@@ -61,11 +67,13 @@ struct StatementExecutor<statement::Collapse<seq_exec,
 
     // compute next-most inner loop Executor
     using next_loop_t = StatementExecutor<
-        statement::Collapse<seq_exec, ArgList<ArgRest...>, EnclosedStmts...>, NewTypes>;
+        statement::Collapse<seq_exec, ArgList<ArgRest...>, EnclosedStmts...>,
+        NewTypes>;
 
     auto len0 = segment_length<Arg0>(data);
 
-    for (auto i0 = 0; i0 < len0; ++i0) {
+    for (auto i0 = 0; i0 < len0; ++i0)
+    {
       data.template assign_offset<Arg0>(i0);
 
       next_loop_t::exec(data);
diff --git a/include/RAJA/policy/sequential/kernel/Reduce.hpp b/include/RAJA/policy/sequential/kernel/Reduce.hpp
index 7280844320..dc94c14d85 100644
--- a/include/RAJA/policy/sequential/kernel/Reduce.hpp
+++ b/include/RAJA/policy/sequential/kernel/Reduce.hpp
@@ -34,10 +34,12 @@ template <template <typename...> class ReduceOperator,
           typename... EnclosedStmts,
           typename Types>
 struct StatementExecutor<
-    statement::Reduce<seq_reduce, ReduceOperator, ParamId, EnclosedStmts...>, Types> {
+    statement::Reduce<seq_reduce, ReduceOperator, ParamId, EnclosedStmts...>,
+    Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
     // since a sequential reduction is a NOP, and the single thread always
     // has the reduced value, this is just a passthrough to the enclosed
diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp
index a2025a71d5..6459189e23 100644
--- a/include/RAJA/policy/sequential/launch.hpp
+++ b/include/RAJA/policy/sequential/launch.hpp
@@ -26,7 +26,8 @@ namespace RAJA
 {
 
 template <>
-struct LaunchExecute<RAJA::null_launch_t> {
+struct LaunchExecute<RAJA::null_launch_t>
+{
   template <typename BODY>
   static void exec(LaunchContext const& RAJA_UNUSED_ARG(ctx),
                    BODY const& RAJA_UNUSED_ARG(body))
@@ -37,20 +38,25 @@ struct LaunchExecute<RAJA::null_launch_t> {
 
 
 template <>
-struct LaunchExecute<RAJA::seq_launch_t> {
+struct LaunchExecute<RAJA::seq_launch_t>
+{
 
   template <typename BODY, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, LaunchParams const &params, const char *RAJA_UNUSED_ARG(kernel_name),
-       BODY const &body, ReduceParams &RAJA_UNUSED_ARG(ReduceParams))
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const& params,
+       const char* RAJA_UNUSED_ARG(kernel_name),
+       BODY const& body,
+       ReduceParams& RAJA_UNUSED_ARG(ReduceParams))
   {
 
     LaunchContext ctx;
 
-    char *kernel_local_mem = new char[params.shared_mem_size];
-    ctx.shared_mem_ptr = kernel_local_mem;
+    char* kernel_local_mem = new char[params.shared_mem_size];
+    ctx.shared_mem_ptr     = kernel_local_mem;
 
     body(ctx);
 
@@ -60,18 +66,23 @@ struct LaunchExecute<RAJA::seq_launch_t> {
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  template<typename BODY, typename ReduceParams>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *RAJA_UNUSED_ARG(kernel_name), BODY const &body, ReduceParams &launch_reducers)
+  template <typename BODY, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const& launch_params,
+       const char* RAJA_UNUSED_ARG(kernel_name),
+       BODY const& body,
+       ReduceParams& launch_reducers)
   {
     expt::ParamMultiplexer::init<seq_exec>(launch_reducers);
 
     LaunchContext ctx;
-    char *kernel_local_mem = new char[launch_params.shared_mem_size];
-    ctx.shared_mem_ptr = kernel_local_mem;
+    char* kernel_local_mem = new char[launch_params.shared_mem_size];
+    ctx.shared_mem_ptr     = kernel_local_mem;
 
     expt::invoke_body(launch_reducers, body, ctx);
 
@@ -82,54 +93,57 @@ struct LaunchExecute<RAJA::seq_launch_t> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 
 template <typename SEGMENT>
-struct LoopExecute<seq_exec, SEGMENT> {
+struct LoopExecute<seq_exec, SEGMENT>
+{
 
   RAJA_SUPPRESS_HD_WARN
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void exec(SEGMENT const& segment,
+                                                BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
 
       body(*(segment.begin() + i));
     }
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
       body(*(segment.begin() + i));
     }
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
 
     // block stride loop
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int j = 0; j < len1; j++) {
-      for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++)
+    {
+      for (int i = 0; i < len0; i++)
+      {
 
         body(*(segment0.begin() + i), *(segment1.begin() + j));
       }
@@ -137,12 +151,12 @@ struct LoopExecute<seq_exec, SEGMENT> {
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
 
     // block stride loop
@@ -150,49 +164,54 @@ struct LoopExecute<seq_exec, SEGMENT> {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int k = 0; k < len2; k++) {
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
+    for (int k = 0; k < len2; k++)
+    {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
+          body(*(segment0.begin() + i), *(segment1.begin() + j),
                *(segment2.begin() + k));
         }
       }
     }
   }
-
 };
 
 
 template <typename SEGMENT>
-struct LoopICountExecute<seq_exec, SEGMENT> {
+struct LoopICountExecute<seq_exec, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
     const int len = segment.end() - segment.begin();
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 
-    template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       BODY const& body)
   {
 
     // block stride loop
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int j = 0; j < len1; j++) {
-      for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++)
+    {
+      for (int i = 0; i < len0; i++)
+      {
 
         body(*(segment0.begin() + i), *(segment1.begin() + j), i, j);
       }
@@ -200,12 +219,12 @@ struct LoopICountExecute<seq_exec, SEGMENT> {
   }
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment0,
+       SEGMENT const& segment1,
+       SEGMENT const& segment2,
+       BODY const& body)
   {
 
     // block stride loop
@@ -213,30 +232,32 @@ struct LoopICountExecute<seq_exec, SEGMENT> {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int k = 0; k < len2; k++) {
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
+    for (int k = 0; k < len2; k++)
+    {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
+          body(*(segment0.begin() + i), *(segment1.begin() + j),
                *(segment2.begin() + k), i, j, k);
         }
       }
     }
   }
-
 };
 
-//Tile Execute + variants
+// Tile Execute + variants
 
 template <typename SEGMENT>
-struct TileExecute<seq_exec, SEGMENT> {
+struct TileExecute<seq_exec, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_HOST_DEVICE RAJA_INLINE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_HOST_DEVICE RAJA_INLINE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -246,28 +267,27 @@ struct TileExecute<seq_exec, SEGMENT> {
       body(segment.slice(tx, tile_size));
     }
   }
-
 };
 
 template <typename SEGMENT>
-struct TileTCountExecute<seq_exec, SEGMENT> {
+struct TileTCountExecute<seq_exec, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_HOST_DEVICE RAJA_INLINE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_HOST_DEVICE RAJA_INLINE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       TILE_T tile_size,
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = 0, bx=0; tx < len; tx += tile_size, bx++)
+    for (int tx = 0, bx = 0; tx < len; tx += tile_size, bx++)
     {
       body(segment.slice(tx, tile_size), bx);
     }
   }
-
 };
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sequential/multi_reduce.hpp b/include/RAJA/policy/sequential/multi_reduce.hpp
index be3a3860f8..2b05ba512e 100644
--- a/include/RAJA/policy/sequential/multi_reduce.hpp
+++ b/include/RAJA/policy/sequential/multi_reduce.hpp
@@ -47,7 +47,7 @@ namespace detail
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template <typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataSeq;
 
 /*!
@@ -59,59 +59,68 @@ struct MultiReduceDataSeq;
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataSeq<T, t_MultiReduceOp,
+template <typename T, typename t_MultiReduceOp>
+struct MultiReduceDataSeq<
+    T,
+    t_MultiReduceOp,
     RAJA::sequential::MultiReduceTuning<
-      RAJA::sequential::multi_reduce_algorithm::left_fold>>
+        RAJA::sequential::multi_reduce_algorithm::left_fold>>
 {
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataSeq() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataSeq>::value>* = nullptr >
+  template <typename Container,
+            std::enable_if_t<
+                !std::is_same<Container, MultiReduceDataSeq>::value>* = nullptr>
   MultiReduceDataSeq(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_num_bins(container.size())
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_num_bins(container.size()),
+        m_identity(identity),
+        m_data(nullptr)
   {
     m_data = create_data(container, m_num_bins);
   }
 
-  MultiReduceDataSeq(MultiReduceDataSeq const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_identity(other.m_identity)
-      , m_data(other.m_data)
-  { }
+  MultiReduceDataSeq(MultiReduceDataSeq const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_identity(other.m_identity),
+        m_data(other.m_data)
+  {}
 
-  MultiReduceDataSeq(MultiReduceDataSeq &&) = delete;
+  MultiReduceDataSeq(MultiReduceDataSeq&&)                 = delete;
   MultiReduceDataSeq& operator=(MultiReduceDataSeq const&) = delete;
-  MultiReduceDataSeq& operator=(MultiReduceDataSeq &&) = delete;
+  MultiReduceDataSeq& operator=(MultiReduceDataSeq&&)      = delete;
 
   ~MultiReduceDataSeq()
   {
-    if (m_data) {
-      if (!m_parent) {
+    if (m_data)
+    {
+      if (!m_parent)
+      {
         destroy_data(m_data, m_num_bins);
       }
     }
   }
 
-  template < typename Container >
+  template <typename Container>
   void reset(Container const& container, T identity)
   {
-    m_identity = identity;
+    m_identity          = identity;
     size_t new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
+    if (new_num_bins != m_num_bins)
+    {
       destroy_data(m_data, m_num_bins);
       m_num_bins = new_num_bins;
-      m_data = create_data(container, m_num_bins);
-    } else {
+      m_data     = create_data(container, m_num_bins);
+    }
+    else
+    {
       size_t bin = 0;
-      for (auto const& value : container) {
+      for (auto const& value : container)
+      {
         m_data[bin] = value;
         ++bin;
       }
@@ -122,27 +131,29 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); }
+  void combine(size_t bin, T const& val) { MultiReduceOp {}(m_data[bin], val); }
 
   T get(size_t bin) const { return m_data[bin]; }
 
 private:
-  MultiReduceDataSeq const *m_parent;
+  MultiReduceDataSeq const* m_parent;
   size_t m_num_bins;
   T m_identity;
   T* m_data;
 
-  template < typename Container >
+  template <typename Container>
   static T* create_data(Container const& container, size_t num_bins)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
 
-    auto data = static_cast<T*>(malloc(num_bins*sizeof(T)));
+    auto data  = static_cast<T*>(malloc(num_bins * sizeof(T)));
     size_t bin = 0;
-    for (auto const& value : container) {
-      new(&data[bin]) T(value);
+    for (auto const& value : container)
+    {
+      new (&data[bin]) T(value);
       ++bin;
     }
     return data;
@@ -150,11 +161,13 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
 
   static void destroy_data(T*& data, size_t num_bins)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
 
-    for (size_t bin = 0; bin < num_bins; ++bin) {
+    for (size_t bin = 0; bin < num_bins; ++bin)
+    {
       data[bin].~T();
     }
     free(data);
@@ -164,7 +177,8 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
 
 }  // namespace detail
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::sequential::seq_multi_reduce_policy, detail::MultiReduceDataSeq)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::sequential::seq_multi_reduce_policy,
+                                detail::MultiReduceDataSeq)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/sequential/params/kernel_name.hpp b/include/RAJA/policy/sequential/params/kernel_name.hpp
index 00e6a1dc52..d31f271569 100644
--- a/include/RAJA/policy/sequential/params/kernel_name.hpp
+++ b/include/RAJA/policy/sequential/params/kernel_name.hpp
@@ -3,35 +3,39 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  combine(KernelName&, T) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
-
-
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>>
+init(KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template <typename EXEC_POL, typename T>
+RAJA_HOST_DEVICE
+    camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>>
+    combine(KernelName&, T)
+{}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>>
+resolve(KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
+
+
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/sequential/params/reduce.hpp b/include/RAJA/policy/sequential/params/reduce.hpp
index b77028ca5f..2902418249 100644
--- a/include/RAJA/policy/sequential/params/reduce.hpp
+++ b/include/RAJA/policy/sequential/params/reduce.hpp
@@ -3,33 +3,39 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>>
+init(Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
 
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>>
+combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
 
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>>
+resolve(Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp
index 287af42502..00fa7274a3 100644
--- a/include/RAJA/policy/sequential/policy.hpp
+++ b/include/RAJA/policy/sequential/policy.hpp
@@ -30,7 +30,7 @@ enum struct multi_reduce_algorithm : int
   left_fold
 };
 
-template < multi_reduce_algorithm t_multi_algorithm >
+template <multi_reduce_algorithm t_multi_algorithm>
 struct MultiReduceTuning
 {
   static constexpr multi_reduce_algorithm algorithm = t_multi_algorithm;
@@ -38,7 +38,7 @@ struct MultiReduceTuning
       (algorithm == multi_reduce_algorithm::left_fold);
 };
 
-} // namspace sequential
+}  // namespace sequential
 
 namespace policy
 {
@@ -60,20 +60,20 @@ namespace sequential
 struct seq_region : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                           Pattern::region,
                                                           Launch::sync,
-                                                          Platform::host> {
-};
+                                                          Platform::host>
+{};
 
 struct seq_launch_t : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                             Pattern::region,
                                                             Launch::sync,
-                                                            Platform::host> {
-};
+                                                            Platform::host>
+{};
 
 struct seq_exec : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                         Pattern::forall,
                                                         Launch::undefined,
-                                                        Platform::host> {
-};
+                                                        Platform::host>
+{};
 
 ///
 /// Index set segment iteration policies
@@ -86,8 +86,8 @@ using seq_segit = seq_exec;
 struct seq_work : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                         Pattern::workgroup_exec,
                                                         Launch::sync,
-                                                        Platform::host> {
-};
+                                                        Platform::host>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -99,20 +99,20 @@ struct seq_work : make_policy_pattern_launch_platform_t<Policy::sequential,
 struct seq_reduce : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                           Pattern::reduce,
                                                           Launch::undefined,
-                                                          Platform::host> {
-};
+                                                          Platform::host>
+{};
 
 ///
-template < typename tuning >
-struct seq_multi_reduce_policy
-    : make_policy_pattern_launch_platform_t<Policy::sequential,
-                                            Pattern::multi_reduce,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            std::conditional_t<tuning::consistent,
-                                                               reduce::ordered,
-                                                               reduce::unordered>> {
-};
+template <typename tuning>
+struct seq_multi_reduce_policy : make_policy_pattern_launch_platform_t<
+                                     Policy::sequential,
+                                     Pattern::multi_reduce,
+                                     Launch::undefined,
+                                     Platform::host,
+                                     std::conditional_t<tuning::consistent,
+                                                        reduce::ordered,
+                                                        reduce::unordered>>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -121,13 +121,13 @@ struct seq_multi_reduce_policy
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
-struct seq_atomic {
-};
+struct seq_atomic
+{};
 
 
-template < RAJA::sequential::multi_reduce_algorithm algorithm >
-using seq_multi_reduce_tuning = seq_multi_reduce_policy<
-    RAJA::sequential::MultiReduceTuning<algorithm> >;
+template <RAJA::sequential::multi_reduce_algorithm algorithm>
+using seq_multi_reduce_tuning =
+    seq_multi_reduce_policy<RAJA::sequential::MultiReduceTuning<algorithm>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - left_fold policies combine new values into a single value.
@@ -143,12 +143,12 @@ using seq_multi_reduce = seq_multi_reduce_left_fold;
 
 using policy::sequential::seq_atomic;
 using policy::sequential::seq_exec;
-using policy::sequential::seq_reduce;
+using policy::sequential::seq_launch_t;
 using policy::sequential::seq_multi_reduce;
+using policy::sequential::seq_reduce;
 using policy::sequential::seq_region;
 using policy::sequential::seq_segit;
 using policy::sequential::seq_work;
-using policy::sequential::seq_launch_t;
 
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sequential/region.hpp b/include/RAJA/policy/sequential/region.hpp
index 84d03ae202..81c5d41647 100644
--- a/include/RAJA/policy/sequential/region.hpp
+++ b/include/RAJA/policy/sequential/region.hpp
@@ -35,7 +35,7 @@ namespace sequential
  */
 
 template <typename Func>
-RAJA_INLINE void region_impl(const seq_region &, Func &&body)
+RAJA_INLINE void region_impl(const seq_region&, Func&& body)
 {
   body();
 }
diff --git a/include/RAJA/policy/sequential/scan.hpp b/include/RAJA/policy/sequential/scan.hpp
index 4bcc73366d..e0e12e0a58 100644
--- a/include/RAJA/policy/sequential/scan.hpp
+++ b/include/RAJA/policy/sequential/scan.hpp
@@ -41,22 +41,21 @@ namespace scan
    initial value
 */
 template <typename ExecPolicy, typename Iter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-inclusive_inplace(
-    resources::Host host_res,
-    const ExecPolicy &,
-    Iter begin,
-    Iter end,
-    BinFn f)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+inclusive_inplace(resources::Host host_res,
+                  const ExecPolicy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f)
 {
   using ValueT = typename std::remove_reference<decltype(*begin)>::type;
-  ValueT agg = *begin;
+  ValueT agg   = *begin;
 
-  for (Iter i = ++begin; i != end; ++i) {
+  for (Iter i = ++begin; i != end; ++i)
+  {
     agg = f(agg, *i);
-    *i = agg;
+    *i  = agg;
   }
 
   return resources::EventProxy<resources::Host>(host_res);
@@ -67,28 +66,27 @@ inclusive_inplace(
    initial value
 */
 template <typename ExecPolicy, typename Iter, typename BinFn, typename T>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-exclusive_inplace(
-    resources::Host host_res,
-    const ExecPolicy &,
-    Iter begin,
-    Iter end,
-    BinFn f,
-    T v)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+exclusive_inplace(resources::Host host_res,
+                  const ExecPolicy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f,
+                  T v)
 {
   using std::distance;
-  const auto n = distance(begin, end);
+  const auto n    = distance(begin, end);
   using DistanceT = typename std::remove_const<decltype(n)>::type;
 
   using ValueT = typename std::remove_reference<decltype(*begin)>::type;
-  ValueT agg = v;
+  ValueT agg   = v;
 
-  for (DistanceT i = 0; i < n; ++i) {
-    auto t = begin[i];
+  for (DistanceT i = 0; i < n; ++i)
+  {
+    auto t   = begin[i];
     begin[i] = agg;
-    agg = f(agg, t);
+    agg      = f(agg, t);
   }
 
   return resources::EventProxy<resources::Host>(host_res);
@@ -99,23 +97,22 @@ exclusive_inplace(
    initial value
 */
 template <typename ExecPolicy, typename Iter, typename OutIter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-inclusive(
-    resources::Host host_res,
-    const ExecPolicy &,
-    const Iter begin,
-    const Iter end,
-    OutIter out,
-    BinFn f)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+inclusive(resources::Host host_res,
+          const ExecPolicy&,
+          const Iter begin,
+          const Iter end,
+          OutIter out,
+          BinFn f)
 {
   using ValueT = typename std::remove_reference<decltype(*out)>::type;
-  ValueT agg = *begin;
-  *out++ = agg;
+  ValueT agg   = *begin;
+  *out++       = agg;
 
-  for (Iter i = begin + 1; i != end; ++i) {
-    agg = f(agg, *i);
+  for (Iter i = begin + 1; i != end; ++i)
+  {
+    agg    = f(agg, *i);
     *out++ = agg;
   }
 
@@ -131,26 +128,25 @@ template <typename ExecPolicy,
           typename OutIter,
           typename BinFn,
           typename T>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-exclusive(
-    resources::Host host_res,
-    const ExecPolicy &,
-    const Iter begin,
-    const Iter end,
-    OutIter out,
-    BinFn f,
-    T v)
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+exclusive(resources::Host host_res,
+          const ExecPolicy&,
+          const Iter begin,
+          const Iter end,
+          OutIter out,
+          BinFn f,
+          T v)
 {
   using ValueT = typename std::remove_reference<decltype(*out)>::type;
-  ValueT agg = v;
-  OutIter o = out;
-  *o++ = v;
+  ValueT agg   = v;
+  OutIter o    = out;
+  *o++         = v;
 
-  for (Iter i = begin; i != end - 1; ++i, ++o) {
+  for (Iter i = begin; i != end - 1; ++i, ++o)
+  {
     agg = f(agg, *i);
-    *o = agg;
+    *o  = agg;
   }
 
   return resources::EventProxy<resources::Host>(host_res);
diff --git a/include/RAJA/policy/sequential/sort.hpp b/include/RAJA/policy/sequential/sort.hpp
index 98dcf6fc27..0a31400029 100644
--- a/include/RAJA/policy/sequential/sort.hpp
+++ b/include/RAJA/policy/sequential/sort.hpp
@@ -30,7 +30,7 @@
 
 #include "RAJA/util/zip.hpp"
 
-#include "RAJA/util/sort.hpp" 
+#include "RAJA/util/sort.hpp"
 
 #include "RAJA/policy/sequential/policy.hpp"
 
@@ -50,9 +50,8 @@ namespace detail
 */
 struct UnstableSorter
 {
-  template < typename... Args >
-  RAJA_INLINE
-  void operator()(Args&&... args) const
+  template <typename... Args>
+  RAJA_INLINE void operator()(Args&&... args) const
   {
     RAJA::detail::intro_sort(std::forward<Args>(args)...);
   }
@@ -64,15 +63,14 @@ struct UnstableSorter
 */
 struct StableSorter
 {
-  template < typename... Args >
-  RAJA_INLINE
-  void operator()(Args&&... args) const
+  template <typename... Args>
+  RAJA_INLINE void operator()(Args&&... args) const
   {
     RAJA::detail::merge_sort(std::forward<Args>(args)...);
   }
 };
 
-} // namespace detail
+}  // namespace detail
 
 /*!
         \brief sort given range using comparison function
@@ -80,14 +78,13 @@ struct StableSorter
 template <typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-unstable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+unstable(resources::Host host_res,
+         const ExecPolicy&,
+         Iter begin,
+         Iter end,
+         Compare comp)
 {
-  detail::UnstableSorter{}(begin, end, comp);
+  detail::UnstableSorter {}(begin, end, comp);
 
   return resources::EventProxy<resources::Host>(host_res);
 }
@@ -98,14 +95,13 @@ unstable(
 template <typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-stable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+stable(resources::Host host_res,
+       const ExecPolicy&,
+       Iter begin,
+       Iter end,
+       Compare comp)
 {
-  detail::StableSorter{}(begin, end, comp);
+  detail::StableSorter {}(begin, end, comp);
 
   return resources::EventProxy<resources::Host>(host_res);
 }
@@ -113,43 +109,48 @@ stable(
 /*!
         \brief sort given range of pairs using comparison function on keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template <typename ExecPolicy,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-unstable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+unstable_pairs(resources::Host host_res,
+               const ExecPolicy&,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               Compare comp)
 {
-  auto begin = RAJA::zip(keys_begin, vals_begin);
-  auto end = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin    = RAJA::zip(keys_begin, vals_begin);
+  auto end      = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::UnstableSorter{}(begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::UnstableSorter {}(begin, end, RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
 
 /*!
-        \brief stable sort given range of pairs using comparison function on keys
+        \brief stable sort given range of pairs using comparison function
+        on keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template <typename ExecPolicy,
+          typename KeyIter,
+          typename ValIter,
+          typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-stable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+stable_pairs(resources::Host host_res,
+             const ExecPolicy&,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             Compare comp)
 {
-  auto begin = RAJA::zip(keys_begin, vals_begin);
-  auto end = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin    = RAJA::zip(keys_begin, vals_begin);
+  auto end      = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::StableSorter{}(begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::StableSorter {}(begin, end, RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
diff --git a/include/RAJA/policy/simd/forall.hpp b/include/RAJA/policy/simd/forall.hpp
index 8c5b38af9c..851eb2317f 100644
--- a/include/RAJA/policy/simd/forall.hpp
+++ b/include/RAJA/policy/simd/forall.hpp
@@ -48,25 +48,24 @@ namespace simd
 
 
 template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(RAJA::resources::Host host_res,
-            const simd_exec &,
-            Iterable &&iter,
-            Func &&loop_body,
+            const simd_exec&,
+            Iterable&& iter,
+            Func&& loop_body,
             ForallParam f_params)
 {
   expt::ParamMultiplexer::init<seq_exec>(f_params);
 
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
+  auto begin    = std::begin(iter);
+  auto end      = std::end(iter);
   auto distance = std::distance(begin, end);
   RAJA_SIMD
-  for (decltype(distance) i = 0; i < distance; ++i) {
+  for (decltype(distance) i = 0; i < distance; ++i)
+  {
     expt::invoke_body(f_params, loop_body, *(begin + i));
   }
 
@@ -75,23 +74,22 @@ forall_impl(RAJA::resources::Host host_res,
 }
 
 template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(RAJA::resources::Host host_res,
-            const simd_exec &,
-            Iterable &&iter,
-            Func &&loop_body,
+            const simd_exec&,
+            Iterable&& iter,
+            Func&& loop_body,
             ForallParam)
 {
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
+  auto begin    = std::begin(iter);
+  auto end      = std::end(iter);
   auto distance = std::distance(begin, end);
   RAJA_SIMD
-  for (decltype(distance) i = 0; i < distance; ++i) {
+  for (decltype(distance) i = 0; i < distance; ++i)
+  {
     loop_body(*(begin + i));
   }
 
diff --git a/include/RAJA/policy/simd/kernel/For.hpp b/include/RAJA/policy/simd/kernel/For.hpp
index 53ed45ad1f..ae4e673a15 100644
--- a/include/RAJA/policy/simd/kernel/For.hpp
+++ b/include/RAJA/policy/simd/kernel/For.hpp
@@ -40,12 +40,14 @@ namespace internal
  *
  */
 template <class T>
-struct TypeIsLambda {
+struct TypeIsLambda
+{
   static const bool value = false;
 };
 
-template <camp::idx_t BodyIdx, typename ... Args>
-struct TypeIsLambda<RAJA::statement::Lambda<BodyIdx, Args...>> {
+template <camp::idx_t BodyIdx, typename... Args>
+struct TypeIsLambda<RAJA::statement::Lambda<BodyIdx, Args...>>
+{
   static const bool value = true;
 };
 
@@ -59,10 +61,11 @@ template <typename Types, class... Statements>
 struct Invoke_all_Lambda;
 
 template <typename Types>
-struct Invoke_all_Lambda<Types> {
+struct Invoke_all_Lambda<Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void lambda_special(Data &&)
+  static RAJA_INLINE void lambda_special(Data&&)
   {
     // NOP terminator
   }
@@ -70,7 +73,8 @@ struct Invoke_all_Lambda<Types> {
 
 
 template <typename Types, class Statement, class... StatementRest>
-struct Invoke_all_Lambda<Types, Statement, StatementRest...> {
+struct Invoke_all_Lambda<Types, Statement, StatementRest...>
+{
 
   // Lambda check
   static const bool value = TypeIsLambda<camp::decay<Statement>>::value;
@@ -78,7 +82,7 @@ struct Invoke_all_Lambda<Types, Statement, StatementRest...> {
 
   // Invoke the chain of lambdas
   template <typename Data>
-  static RAJA_INLINE void lambda_special(Data &&data)
+  static RAJA_INLINE void lambda_special(Data&& data)
   {
 
     // Execute this Lambda
@@ -98,32 +102,36 @@ struct Invoke_all_Lambda<Types, Statement, StatementRest...> {
  */
 template <camp::idx_t ArgumentId, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::For<ArgumentId, RAJA::simd_exec, EnclosedStmts...>, Types> {
+    statement::For<ArgumentId, RAJA::simd_exec, EnclosedStmts...>,
+    Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
-    auto iter = get<ArgumentId>(data.segment_tuple);
-    auto begin = std::begin(iter);
-    auto end = std::end(iter);
+    auto iter     = get<ArgumentId>(data.segment_tuple);
+    auto begin    = std::begin(iter);
+    auto end      = std::end(iter);
     auto distance = std::distance(begin, end);
 
     RAJA_SIMD
-    for (decltype(distance) i = 0; i < distance; ++i) {
+    for (decltype(distance) i = 0; i < distance; ++i)
+    {
 
       // Privatize data for SIMD correctness reasons
       using RAJA::internal::thread_privatize;
-      auto privatizer = thread_privatize(data);
+      auto privatizer    = thread_privatize(data);
       auto& private_data = privatizer.get_priv();
 
       // Assign offset on privatized data
       private_data.template assign_offset<ArgumentId>(i);
 
-      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(private_data);
+      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(
+          private_data);
     }
   }
 };
diff --git a/include/RAJA/policy/simd/kernel/ForICount.hpp b/include/RAJA/policy/simd/kernel/ForICount.hpp
index 36a169f2bf..4544e7ad54 100644
--- a/include/RAJA/policy/simd/kernel/ForICount.hpp
+++ b/include/RAJA/policy/simd/kernel/ForICount.hpp
@@ -42,26 +42,31 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId,
-          typename... EnclosedStmts, typename Types>
+template <camp::idx_t ArgumentId,
+          typename ParamId,
+          typename... EnclosedStmts,
+          typename Types>
 struct StatementExecutor<
-    statement::ForICount<ArgumentId, ParamId, RAJA::simd_exec,
-                         EnclosedStmts...>, Types> {
+    statement::
+        ForICount<ArgumentId, ParamId, RAJA::simd_exec, EnclosedStmts...>,
+    Types>
+{
 
   template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
-    auto iter = get<ArgumentId>(data.segment_tuple);
-    auto begin = std::begin(iter);
-    auto end = std::end(iter);
+    auto iter     = get<ArgumentId>(data.segment_tuple);
+    auto begin    = std::begin(iter);
+    auto end      = std::end(iter);
     auto distance = std::distance(begin, end);
 
     RAJA_SIMD
-    for (decltype(distance) i = 0; i < distance; ++i) {
+    for (decltype(distance) i = 0; i < distance; ++i)
+    {
 
       // Offsets and parameters need to be privatized
       data.template assign_offset<ArgumentId>(i);
@@ -69,10 +74,11 @@ struct StatementExecutor<
 
       // Privatize data for SIMD correctness reasons
       using RAJA::internal::thread_privatize;
-      auto privatizer = thread_privatize(data);
+      auto privatizer    = thread_privatize(data);
       auto& private_data = privatizer.get_priv();
 
-      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(private_data);
+      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(
+          private_data);
     }
   }
 };
@@ -81,4 +87,4 @@ struct StatementExecutor<
 }  // end namespace RAJA
 
 
-#endif 
+#endif
diff --git a/include/RAJA/policy/simd/launch.hpp b/include/RAJA/policy/simd/launch.hpp
index 1f8ba01ab3..4ccc94fe94 100644
--- a/include/RAJA/policy/simd/launch.hpp
+++ b/include/RAJA/policy/simd/launch.hpp
@@ -26,36 +26,40 @@ namespace RAJA
 {
 
 template <typename SEGMENT>
-struct LoopExecute<simd_exec, SEGMENT> {
+struct LoopExecute<simd_exec, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
     RAJA_SIMD
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
 template <typename SEGMENT>
-struct LoopICountExecute<simd_exec, SEGMENT> {
+struct LoopICountExecute<simd_exec, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_HOST_DEVICE void
+  exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
+       SEGMENT const& segment,
+       BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
     RAJA_SIMD
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
       body(*(segment.begin() + i), i);
     }
   }
diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp
index a85811163f..fac158a36b 100644
--- a/include/RAJA/policy/simd/policy.hpp
+++ b/include/RAJA/policy/simd/policy.hpp
@@ -41,8 +41,8 @@ namespace simd
 struct simd_exec : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                          Pattern::forall,
                                                          Launch::undefined,
-                                                         Platform::host> {
-};
+                                                         Platform::host>
+{};
 
 }  // end of namespace simd
 
diff --git a/include/RAJA/policy/sycl.hpp b/include/RAJA/policy/sycl.hpp
index 491e39910c..81f16d4918 100644
--- a/include/RAJA/policy/sycl.hpp
+++ b/include/RAJA/policy/sycl.hpp
@@ -29,13 +29,13 @@
 #include "RAJA/policy/sycl/forall.hpp"
 #include "RAJA/policy/sycl/policy.hpp"
 #include "RAJA/policy/sycl/reduce.hpp"
-//#include "RAJA/policy/sycl/multi_reduce.hpp"
-//#include "RAJA/policy/sycl/scan.hpp"
-//#include "RAJA/policy/sycl/sort.hpp"
+// #include "RAJA/policy/sycl/multi_reduce.hpp"
+// #include "RAJA/policy/sycl/scan.hpp"
+// #include "RAJA/policy/sycl/sort.hpp"
 #include "RAJA/policy/sycl/kernel.hpp"
-//#include "RAJA/policy/sycl/synchronize.hpp"
+// #include "RAJA/policy/sycl/synchronize.hpp"
 #include "RAJA/policy/sycl/launch.hpp"
-//#include "RAJA/policy/sycl/WorkGroup.hpp"
+// #include "RAJA/policy/sycl/WorkGroup.hpp"
 
 #endif  // closing endif for if defined(RAJA_ENABLE_SYCL)
 
diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
index 081a88dc23..e1c6cbc884 100644
--- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
+++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
@@ -47,10 +47,11 @@ namespace detail
 {
 
 //! struct containing data necessary to coordinate kernel launches with reducers
-struct syclInfo {
-  sycl_dim_t gridDim{0};
-  sycl_dim_t blockDim{0};
-  cl::sycl::queue qu = cl::sycl::queue();
+struct syclInfo
+{
+  sycl_dim_t gridDim {0};
+  sycl_dim_t blockDim {0};
+  cl::sycl::queue qu  = cl::sycl::queue();
   bool setup_reducers = false;
 #if defined(RAJA_ENABLE_OPENMP)
   syclInfo* thread_states = nullptr;
@@ -67,14 +68,15 @@ extern std::unordered_map<cl::sycl::queue, bool> g_queue_info_map;
 }  // namespace detail
 
 //! Allocator for pinned memory for use in basic_mempool
-struct PinnedAllocator {
+struct PinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
     void* ptr;
     ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
-    ptr = ::sycl::malloc_host(nbytes, *q);
+    ptr              = ::sycl::malloc_host(nbytes, *q);
     return ptr;
   }
 
@@ -89,14 +91,15 @@ struct PinnedAllocator {
 };
 
 //! Allocator for device memory for use in basic_mempool
-struct DeviceAllocator {
+struct DeviceAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
     void* ptr;
     ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
-    ptr = ::sycl::malloc_device(nbytes, *q);
+    ptr              = ::sycl::malloc_device(nbytes, *q);
     return ptr;
   }
 
@@ -112,14 +115,15 @@ struct DeviceAllocator {
 
 //! Allocator for pre-zeroed device memory for use in basic_mempool
 //  Note: Memory must be zero when returned to mempool
-struct DeviceZeroedAllocator {
+struct DeviceZeroedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
     void* ptr;
     ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
-    ptr = ::sycl::malloc_device(nbytes, *q);
+    ptr              = ::sycl::malloc_device(nbytes, *q);
     q->memset(ptr, 0, nbytes);
     return ptr;
   }
@@ -146,4 +150,3 @@ using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 #endif  // closing endif for RAJA_ENABLE_SYCL
 
 #endif  // closing endif for header file include guard
-
diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp
index 901cc694f0..0232c1270d 100644
--- a/include/RAJA/policy/sycl/forall.hpp
+++ b/include/RAJA/policy/sycl/forall.hpp
@@ -85,14 +85,18 @@ ::sycl::range<1> getGridDim(size_t len, size_t block_size)
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Sycl &sycl_res,
+template <typename Iterable,
+          typename LoopBody,
+          size_t BlockSize,
+          bool Async,
+          typename ForallParam,
+          typename std::enable_if<std::is_trivially_copyable<LoopBody> {},
+                                  bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -101,17 +105,19 @@ forall_impl(resources::Sycl &sycl_res,
 
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0) {
+  if (len > 0 && BlockSize > 0)
+  {
     // Note: We could fix an incorrect workgroup size.
     //       It would change what was specified.
     //       For now, leave the device compiler to error with invalid WG size.
@@ -119,33 +125,43 @@ forall_impl(resources::Sycl &sycl_res,
     //
     // Compute the number of blocks
     //
-    sycl_dim_t blockSize{BlockSize};
+    sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
 
-    q->submit([&](::sycl::handler& h) {
-
-      h.parallel_for( ::sycl::nd_range<1>{gridSize, blockSize},
-                      [=]  (::sycl::nd_item<1> it) {
-
-        IndexType ii = it.get_global_id(0);
-        if (ii < len) {
-          loop_body(begin[ii]);
-        }
-      });
-    });
-
-    if (!Async) { q->wait(); }
+    q->submit(
+        [&](::sycl::handler& h)
+        {
+          h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
+                         [=](::sycl::nd_item<1> it)
+                         {
+                           IndexType ii = it.get_global_id(0);
+                           if (ii < len)
+                           {
+                             loop_body(begin[ii]);
+                           }
+                         });
+        });
+
+    if (!Async)
+    {
+      q->wait();
+    }
   }
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<!std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE 
-resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
+template <typename Iterable,
+          typename LoopBody,
+          size_t BlockSize,
+          bool Async,
+          typename ForallParam,
+          typename std::enable_if<!std::is_trivially_copyable<LoopBody> {},
+                                  bool>::type = true>
+RAJA_INLINE resources::EventProxy<resources::Sycl>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -153,17 +169,19 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0) {
+  if (len > 0 && BlockSize > 0)
+  {
 
     // Note: We could fix an incorrect workgroup size.
     //       It would change what was specified.
@@ -172,7 +190,7 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
     //
     // Compute the number of blocks
     //
-    sycl_dim_t blockSize{BlockSize};
+    sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
@@ -186,24 +204,27 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
     q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
 
-    beg = (Iterator*) ::sycl::malloc_device(sizeof(Iterator), *q);
+    beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
     q->memcpy(beg, &begin, sizeof(Iterator)).wait();
 
-    q->submit([&](::sycl::handler& h) {
-
-      h.parallel_for( ::sycl::nd_range<1>{gridSize, blockSize},
-                      [=]  (::sycl::nd_item<1> it) {
-
-        Index_type ii = it.get_global_id(0);
-
-        if (ii < len) {
-          (*lbody)((*beg)[ii]);
-        }
-      });
-    }).wait(); // Need to wait for completion to free memory
+    q->submit(
+         [&](::sycl::handler& h)
+         {
+           h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
+                          [=](::sycl::nd_item<1> it)
+                          {
+                            Index_type ii = it.get_global_id(0);
+
+                            if (ii < len)
+                            {
+                              (*lbody)((*beg)[ii]);
+                            }
+                          });
+         })
+        .wait();  // Need to wait for completion to free memory
 
     // Free our device memory
     cl::sycl::free(lbody, *q);
@@ -215,14 +236,19 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
   return resources::EventProxy<resources::Sycl>(sycl_res);
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t< 
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Sycl &sycl_res,
+template <typename Iterable,
+          typename LoopBody,
+          size_t BlockSize,
+          bool Async,
+          typename ForallParam,
+          typename std::enable_if<std::is_trivially_copyable<LoopBody> {},
+                                  bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -231,70 +257,79 @@ forall_impl(resources::Sycl &sycl_res,
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
   using EXEC_POL = RAJA::sycl_exec<BlockSize, Async>;
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0) {
+  if (len > 0 && BlockSize > 0)
+  {
 
     //
     // Compute the number of blocks
     //
-    sycl_dim_t blockSize{BlockSize};
+    sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
 
-    auto combiner = []( ForallParam x, ForallParam y ) {
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+    auto combiner = [](ForallParam x, ForallParam y)
+    {
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
       return x;
     };
 
-    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1,*q);
+    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
     RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
     auto reduction = ::sycl::reduction(res, f_params, combiner);
 
-    q->submit([&](::sycl::handler& h) {
-      h.parallel_for( ::sycl::range<1>(len),
-                      reduction,
-                      [=]   (::sycl::item<1> it, auto & red)  {
-
-        ForallParam fp;
-	RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
-        IndexType ii = it.get_id(0);
-        if (ii < len) {
-          RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
-        }
-        red.combine(fp);
-      });
-    });
+    q->submit(
+        [&](::sycl::handler& h)
+        {
+          h.parallel_for(::sycl::range<1>(len), reduction,
+                         [=](::sycl::item<1> it, auto& red)
+                         {
+                           ForallParam fp;
+                           RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+                           IndexType ii = it.get_id(0);
+                           if (ii < len)
+                           {
+                             RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
+                           }
+                           red.combine(fp);
+                         });
+        });
 
     q->wait();
-    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( f_params, *res );
+    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params, *res);
     ::sycl::free(res, *q);
   }
   RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
-
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<!std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t< 
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Sycl &sycl_res,
+template <typename Iterable,
+          typename LoopBody,
+          size_t BlockSize,
+          bool Async,
+          typename ForallParam,
+          typename std::enable_if<!std::is_trivially_copyable<LoopBody> {},
+                                  bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -303,29 +338,32 @@ forall_impl(resources::Sycl &sycl_res,
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
   using EXEC_POL = RAJA::sycl_exec<BlockSize, Async>;
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0) {
+  if (len > 0 && BlockSize > 0)
+  {
     //
     // Compute the number of blocks
     //
-    sycl_dim_t blockSize{BlockSize};
+    sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
 
-    auto combiner = []( ForallParam x, ForallParam y ) {
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+    auto combiner = [](ForallParam x, ForallParam y)
+    {
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
       return x;
     };
 
@@ -339,45 +377,44 @@ forall_impl(resources::Sycl &sycl_res,
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
     q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
 
-    beg = (Iterator*) ::sycl::malloc_device(sizeof(Iterator), *q);
+    beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
     q->memcpy(beg, &begin, sizeof(Iterator)).wait();
 
-    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1,*q);
+    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
     RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
     auto reduction = ::sycl::reduction(res, f_params, combiner);
 
-    q->submit([&](::sycl::handler& h) {
-      h.parallel_for( ::sycl::range<1>(len),
-                      reduction,
-                      [=]   (::sycl::item<1> it, auto & red)  {
-
-
-        Index_type ii = it.get_id(0);
-        ForallParam fp;
-	RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
-        if (ii < len) {
-          RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]);
-        }
-        red.combine(fp);
-
-      });
-    }).wait(); // Need to wait for completion to free memory
-    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( f_params, *res );
+    q->submit(
+         [&](::sycl::handler& h)
+         {
+           h.parallel_for(::sycl::range<1>(len), reduction,
+                          [=](::sycl::item<1> it, auto& red)
+                          {
+                            Index_type ii = it.get_id(0);
+                            ForallParam fp;
+                            RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+                            if (ii < len)
+                            {
+                              RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]);
+                            }
+                            red.combine(fp);
+                          });
+         })
+        .wait();  // Need to wait for completion to free memory
+    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params, *res);
     // Free our device memory
     ::sycl::free(res, *q);
     ::sycl::free(lbody, *q);
     ::sycl::free(beg, *q);
 
     RAJA_FT_END;
-
   }
   RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
-
 }
 
 
@@ -403,23 +440,23 @@ template <typename LoopBody,
           size_t BlockSize,
           bool Async,
           typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &r,
-                                                    ExecPolicy<seq_segit, sycl_exec<BlockSize, Async>>,
-                                                    const TypedIndexSet<SegmentTypes...>& iset,
-                                                    LoopBody&& loop_body)
+RAJA_INLINE resources::EventProxy<resources::Sycl>
+forall_impl(resources::Sycl& r,
+            ExecPolicy<seq_segit, sycl_exec<BlockSize, Async>>,
+            const TypedIndexSet<SegmentTypes...>& iset,
+            LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
-  for (int isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(r,
-                     isi,
-                     detail::CallForall(),
-                     sycl_exec<BlockSize, true>(),
+  for (int isi = 0; isi < num_seg; ++isi)
+  {
+    iset.segmentCall(r, isi, detail::CallForall(), sycl_exec<BlockSize, true>(),
                      loop_body);
   }  // iterate over segments of index set
 
-  if ( !Async ) {
+  if (!Async)
+  {
     ::sycl::queue* q = r.get_queue();
-    q->wait(); 
+    q->wait();
   }
 
   return resources::EventProxy<resources::Sycl>(r);
diff --git a/include/RAJA/policy/sycl/kernel.hpp b/include/RAJA/policy/sycl/kernel.hpp
index 641c3a9ef3..803bcd49e0 100644
--- a/include/RAJA/policy/sycl/kernel.hpp
+++ b/include/RAJA/policy/sycl/kernel.hpp
@@ -23,11 +23,11 @@
 #include "RAJA/policy/sycl/kernel/SyclKernel.hpp"
 #include "RAJA/policy/sycl/kernel/For.hpp"
 #include "RAJA/policy/sycl/kernel/ForICount.hpp"
-//#include "RAJA/policy/sycl/kernel/Hyperplane.hpp"
-//#include "RAJA/policy/sycl/kernel/InitLocalMem.hpp"
+// #include "RAJA/policy/sycl/kernel/Hyperplane.hpp"
+// #include "RAJA/policy/sycl/kernel/InitLocalMem.hpp"
 #include "RAJA/policy/sycl/kernel/Lambda.hpp"
-//#include "RAJA/policy/sycl/kernel/Reduce.hpp"
-//#include "RAJA/policy/sycl/kernel/Sync.hpp"
+// #include "RAJA/policy/sycl/kernel/Reduce.hpp"
+// #include "RAJA/policy/sycl/kernel/Sync.hpp"
 #include "RAJA/policy/sycl/kernel/Tile.hpp"
 #include "RAJA/policy/sycl/kernel/TileTCount.hpp"
 #include "RAJA/policy/sycl/kernel/internal.hpp"
diff --git a/include/RAJA/policy/sycl/kernel/Conditional.hpp b/include/RAJA/policy/sycl/kernel/Conditional.hpp
index 9149418518..f7cc487a28 100644
--- a/include/RAJA/policy/sycl/kernel/Conditional.hpp
+++ b/include/RAJA/policy/sycl/kernel/Conditional.hpp
@@ -42,18 +42,18 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<Data,
                              statement::If<Conditional, EnclosedStmts...>,
-                             Types> {
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
-    if (Conditional::eval(data)) {
+    if (Conditional::eval(data))
+    {
 
       // execute enclosed statements
       enclosed_stmts_t::exec(data, item, thread_active);
@@ -61,10 +61,7 @@ struct SyclStatementExecutor<Data,
   }
 
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/sycl/kernel/For.hpp b/include/RAJA/policy/sycl/kernel/For.hpp
index d0976b931f..4b95bff6d6 100644
--- a/include/RAJA/policy/sycl/kernel/For.hpp
+++ b/include/RAJA/policy/sycl/kernel/For.hpp
@@ -45,8 +45,11 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_global_012<Dim, Local_Size>, EnclosedStmts...>,
-    Types> {
+    statement::For<ArgumentId,
+                   RAJA::sycl_global_012<Dim, Local_Size>,
+                   EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -58,38 +61,39 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
-    auto i = item.get_global_id(Dim);
+    auto i   = item.get_global_id(Dim);
 
-      // Assign the x thread to the argument
-      data.template assign_offset<ArgumentId>(i);
+    // Assign the x thread to the argument
+    data.template assign_offset<ArgumentId>(i);
 
-      // execute enclosed statements
-      enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    // execute enclosed statements
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // Set Global Space for Dimension and Local Size
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.global.x = len;
-      dims.local.x = Local_Size;
+      dims.local.x  = Local_Size;
     }
-    if (Dim == 1) {
+    if (Dim == 1)
+    {
       dims.global.y = len;
-      dims.local.y = Local_Size;
+      dims.local.y  = Local_Size;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.global.z = len;
-      dims.local.z = Local_Size;
+      dims.local.z  = Local_Size;
     }
 
     // combine with enclosed statements
@@ -108,10 +112,12 @@ template <typename Data,
           int Dim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_group_012_direct<Dim>, EnclosedStmts...>,
-    Types> {
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_group_012_direct<Dim>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -123,34 +129,35 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
-    auto i = item.get_group(Dim);
+    auto i   = item.get_group(Dim);
 
-      // Assign the x thread to the argument
-      data.template assign_offset<ArgumentId>(i);
+    // Assign the x thread to the argument
+    data.template assign_offset<ArgumentId>(i);
 
-      // execute enclosed statements
-      enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    // execute enclosed statements
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.group.x = len;
     }
-    if (Dim == 1) {
+    if (Dim == 1)
+    {
       dims.group.y = len;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.group.z = len;
     }
 
@@ -171,10 +178,12 @@ template <typename Data,
           int Dim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_group_012_loop<Dim>, EnclosedStmts...>,
-    Types> {
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_group_012_loop<Dim>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -186,14 +195,15 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
-    auto len = segment_length<ArgumentId>(data);
-    auto i0 = item.get_group(Dim);
+    auto len      = segment_length<ArgumentId>(data);
+    auto i0       = item.get_group(Dim);
     auto i_stride = item.get_group_range(Dim);
 
-    for(auto i = i0;i < len;i += i_stride){
+    for (auto i = i0; i < len; i += i_stride)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -203,21 +213,22 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.group.x = len;
-    } 
-    if (Dim == 1) {
+    }
+    if (Dim == 1)
+    {
       dims.group.y = len;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.group.z = len;
     }
 
@@ -237,10 +248,12 @@ template <typename Data,
           int Dim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_local_012_direct<Dim>, EnclosedStmts...>,
-    Types> {
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_local_012_direct<Dim>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -252,35 +265,35 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
-    auto i = item.get_local_id(Dim);
+    auto i   = item.get_local_id(Dim);
 
     // assign thread id directly to offset
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
-
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.local.x = len;
     }
-    if (Dim == 1) {
+    if (Dim == 1)
+    {
       dims.local.y = len;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.local.z = len;
     }
 
@@ -301,10 +314,12 @@ template <typename Data,
           int Dim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_local_012_loop<Dim>, EnclosedStmts...>,
-    Types> {
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_local_012_loop<Dim>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -316,15 +331,16 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
-    auto len = segment_length<ArgumentId>(data);
-    auto i0 = item.get_local_id(Dim);
+    auto len      = segment_length<ArgumentId>(data);
+    auto i0       = item.get_local_id(Dim);
     auto i_stride = item.get_local_range(Dim);
-    auto i = i0;
+    auto i        = i0;
 
-    for(; i < len;i += i_stride){
+    for (; i < len; i += i_stride)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -333,7 +349,7 @@ struct SyclStatementExecutor<
       enclosed_stmts_t::exec(data, item, thread_active);
     }
     // do we need one more masked iteration?
-    if(i - i0 < len)
+    if (i - i0 < len)
     {
       // execute enclosed statements one more time, but masking them off
       // this is because there's at least one thread that isn't masked off
@@ -342,21 +358,22 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.local.x = len;
     }
-    if (Dim == 1) {
+    if (Dim == 1)
+    {
       dims.local.y = len;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.local.z = len;
     }
 
@@ -380,7 +397,8 @@ template <typename Data,
 struct SyclStatementExecutor<
     Data,
     statement::For<ArgumentId, RAJA::sycl_exec<Local_Size>, EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -392,13 +410,13 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item)
+  static inline RAJA_DEVICE void exec(Data& data, cl::sycl::nd_item<3> item)
   {
     auto len = segment_length<ArgumentId>(data);
-    auto i = item.get_global_id(0);
+    auto i   = item.get_global_id(0);
 
-    if (i < len) {
+    if (i < len)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -409,15 +427,13 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    dims.local.x = Local_Size;
+    dims.local.x  = Local_Size;
     dims.global.x = len;
 
     // combine with enclosed statements
@@ -439,7 +455,8 @@ template <typename Data,
 struct SyclStatementExecutor<
     Data,
     statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -451,17 +468,17 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
 
-    using idx_type = camp::decay<decltype(camp::get<ArgumentId>(data.offset_tuple))>;
+    using idx_type =
+        camp::decay<decltype(camp::get<ArgumentId>(data.offset_tuple))>;
 
     idx_type len = segment_length<ArgumentId>(data);
 
-    for(idx_type i = 0;i < len;++ i){
+    for (idx_type i = 0; i < len; ++i)
+    {
       // Assign i to the argument
       data.template assign_offset<ArgumentId>(i);
 
@@ -470,9 +487,7 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
@@ -483,4 +498,4 @@ struct SyclStatementExecutor<
 }  // end namespace RAJA
 
 
-#endif 
+#endif
diff --git a/include/RAJA/policy/sycl/kernel/ForICount.hpp b/include/RAJA/policy/sycl/kernel/ForICount.hpp
index 9c25bb0ab9..feb5c195c4 100644
--- a/include/RAJA/policy/sycl/kernel/ForICount.hpp
+++ b/include/RAJA/policy/sycl/kernel/ForICount.hpp
@@ -31,7 +31,6 @@ namespace internal
 {
 
 
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Mapping directly from local id to indices
@@ -46,41 +45,45 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_012_direct<ThreadDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>, Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_012_direct<ThreadDim>,
+                         EnclosedStmts...>,
+          Types>
+{
 
   using Base = SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>,
-        Types>;
+      Data,
+      statement::For<ArgumentId,
+                     RAJA::sycl_local_012_direct<ThreadDim>,
+                     EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
-    auto i = item.get_local_id(ThreadDim);
+    auto i     = item.get_local_id(ThreadDim);
 
     // assign thread id directly to offset
     data.template assign_offset<ArgumentId>(i);
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
-
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 };
 
 
-
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Assigns the loop index to offset ArgumentId
@@ -89,58 +92,59 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::sycl_local_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_local_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = SyclStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::sycl_local_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
-    auto i0 = item.get_local_id(0);
-    diff_t i = mask_t::maskValue(i0);
+    auto i0    = item.get_local_id(0);
+    diff_t i   = mask_t::maskValue(i0);
 
     // assign thread id directly to offset
     data.template assign_offset<ArgumentId>(i);
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
-
 };
 
 
-
-
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Assigns the loop index to offset ArgumentId
@@ -149,48 +153,54 @@ template <typename Data,
           camp::idx_t ArgumentId,
           typename ParamId,
           typename Mask,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::sycl_local_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_local_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = SyclStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::sycl_local_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // masked size strided loop
-    diff_t len = segment_length<ArgumentId>(data);
-    auto i0 = item.get_local_id(0);
-    diff_t i_init = mask_t::maskValue(i0);
-    diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    diff_t len      = segment_length<ArgumentId>(data);
+    auto i0         = item.get_local_id(0);
+    diff_t i_init   = mask_t::maskValue(i0);
+    diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -205,13 +215,9 @@ struct SyclStatementExecutor<
       enclosed_stmts_t::exec(data, item, thread_active && have_work);
     }
   }
-
 };
 
 
-
-
-
 /*
  * Executor for thread work sharing loop inside SyclKernel.
  * Provides a block-stride loop (stride of blockDim.xyz) for
@@ -227,31 +233,40 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_012_loop<ThreadDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_012_loop<ThreadDim>,
+                         EnclosedStmts...>,
+          Types>
+{
 
-  using Base = SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
-        Types>;
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_012_loop<ThreadDim>,
+                                           EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // block stride loop
-    diff_t len = segment_length<ArgumentId>(data);
-    auto i_init = item.get_local_id(ThreadDim);
+    diff_t len    = segment_length<ArgumentId>(data);
+    auto i_init   = item.get_local_id(ThreadDim);
     auto i_stride = item.get_local_range(ThreadDim);
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -269,7 +284,6 @@ struct SyclStatementExecutor<
 };
 
 
-
 /*
  * Executor for group work sharing inside SyclKernel.
  * Provides a direct mapping of each block in 012.
@@ -284,29 +298,38 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_group_012_direct<BlockDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_group_012_direct<BlockDim>,
+                         EnclosedStmts...>,
+          Types>
+{
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::For<ArgumentId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
+      statement::For<ArgumentId,
+                     RAJA::sycl_group_012_direct<BlockDim>,
+                     EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // grid stride loop
     diff_t len = segment_length<ArgumentId>(data);
-    auto i = item.get_group(BlockDim);
+    auto i     = item.get_group(BlockDim);
 
-    if (i < len) {
+    if (i < len)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -333,31 +356,40 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_group_012_loop<BlockDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_group_012_loop<BlockDim>,
+                         EnclosedStmts...>,
+          Types>
+{
 
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::For<ArgumentId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
-      Types>;
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_group_012_loop<BlockDim>,
+                                           EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // grid stride loop
-    diff_t len = segment_length<ArgumentId>(data);
-    auto i_init = item.get_group(BlockDim);
+    diff_t len    = segment_length<ArgumentId>(data);
+    auto i_init   = item.get_group(BlockDim);
     auto i_stride = item.get_group_range(BlockDim);
 
     // Iterate through grid stride of chunks
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -384,26 +416,29 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>, Types >
+    statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types > {
+          Data,
+          statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+          Types>
+{
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types >;
+      statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
-    for(diff_t i = 0;i < len;++ i){
+    for (diff_t i = 0; i < len; ++i)
+    {
       // Assign i to the argument
       data.template assign_offset<ArgumentId>(i);
       data.template assign_param<ParamId>(i);
@@ -415,9 +450,6 @@ struct SyclStatementExecutor<
 };
 
 
-
-
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/sycl/kernel/Lambda.hpp b/include/RAJA/policy/sycl/kernel/Lambda.hpp
index 0542f4b81e..8da7a878bb 100644
--- a/include/RAJA/policy/sycl/kernel/Lambda.hpp
+++ b/include/RAJA/policy/sycl/kernel/Lambda.hpp
@@ -42,22 +42,28 @@ namespace internal
 
 // SyclStatementExecutor for actually invoking the lambda
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct SyclStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
+template <typename Data,
+          camp::idx_t LambdaIndex,
+          typename... Args,
+          typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::Lambda<LambdaIndex, Args...>,
+                             Types>
+{
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active)
+    {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
-
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims
+  calculateDimensions(Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
index 88c789c062..d36a7fa2af 100644
--- a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
+++ b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
@@ -49,11 +49,11 @@ namespace RAJA
  */
 template <bool async0>
 struct sycl_launch : public RAJA::make_policy_pattern_launch_platform_t<
-                            RAJA::Policy::sycl,
-                            RAJA::Pattern::forall,
-                            detail::get_launch<async0>::value,
-                            RAJA::Platform::sycl>{
-};
+                         RAJA::Policy::sycl,
+                         RAJA::Pattern::forall,
+                         detail::get_launch<async0>::value,
+                         RAJA::Platform::sycl>
+{};
 
 namespace statement
 {
@@ -63,28 +63,24 @@ namespace statement
  */
 template <typename LaunchConfig, typename... EnclosedStmts>
 struct SyclKernelExt
-    : public internal::Statement<LaunchConfig, EnclosedStmts...> {
-};
+    : public internal::Statement<LaunchConfig, EnclosedStmts...>
+{};
 
 /*
  * A RAJA::kernel statement that launches a SYCL kernel.
  * The kernel launch is synchronous.
  */
 template <typename... EnclosedStmts>
-using SyclKernel =
-    SyclKernelExt<sycl_launch<false>,
-                  EnclosedStmts...>;
+using SyclKernel = SyclKernelExt<sycl_launch<false>, EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a SYCL kernel.
  * The kernel launch is asynchronous.
  */
 template <typename... EnclosedStmts>
-using SyclKernelAsync =
-    SyclKernelExt<sycl_launch<true>,
-                  EnclosedStmts...>;
+using SyclKernelAsync = SyclKernelExt<sycl_launch<true>, EnclosedStmts...>;
 
-} // namespace statement
+}  // namespace statement
 
 namespace internal
 {
@@ -96,7 +92,7 @@ template <typename Data, typename Exec>
 void SyclKernelLauncher(Data data, cl::sycl::nd_item<3> item)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   // execute the the object
@@ -107,7 +103,11 @@ void SyclKernelLauncher(Data data, cl::sycl::nd_item<3> item)
  * Helper class that handles SYCL kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<bool IsTriviallyCopyable, typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template <bool IsTriviallyCopyable,
+          typename LaunchPolicy,
+          typename StmtList,
+          typename Data,
+          typename Types>
 struct SyclLaunchHelper;
 
 /*!
@@ -115,17 +115,18 @@ struct SyclLaunchHelper;
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the SYCL occupancy calculator.
  */
-template<bool async0, typename StmtList, typename Data, typename Types>
-struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
+template <bool async0, typename StmtList, typename Data, typename Types>
+struct SyclLaunchHelper<false, sycl_launch<async0>, StmtList, Data, Types>
 {
   using Self = SyclLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
   using data_t = camp::decay<Data>;
 
-  static void launch(Data &&data,
+  static void launch(Data&& data,
                      internal::LaunchDims launch_dims,
                      size_t shmem,
                      cl::sycl::queue* qu)
@@ -136,21 +137,19 @@ struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    data_t* m_data = (data_t*) cl::sycl::malloc_device(sizeof(data_t), *qu);
+    data_t* m_data = (data_t*)cl::sycl::malloc_device(sizeof(data_t), *qu);
     qu->memcpy(m_data, &data, sizeof(data_t)).wait();
 
-    qu->submit([&](cl::sycl::handler& h) {
- 
-      h.parallel_for(launch_dims.fit_nd_range(qu),
-                     [=] (cl::sycl::nd_item<3> item) {
-        
-        SyclKernelLauncher<Data, executor_t>(*m_data, item);
-
-      });
-    }).wait(); // Need to wait to free memory
+    qu->submit(
+          [&](cl::sycl::handler& h)
+          {
+            h.parallel_for(
+                launch_dims.fit_nd_range(qu), [=](cl::sycl::nd_item<3> item)
+                { SyclKernelLauncher<Data, executor_t>(*m_data, item); });
+          })
+        .wait();  // Need to wait to free memory
 
     cl::sycl::free(m_data, *qu);
-
   }
 };
 
@@ -159,34 +158,35 @@ struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the SYCL occupancy calculator.
  */
-template<bool async0, typename StmtList, typename Data, typename Types>
-struct SyclLaunchHelper<true,sycl_launch<async0>,StmtList,Data,Types>
+template <bool async0, typename StmtList, typename Data, typename Types>
+struct SyclLaunchHelper<true, sycl_launch<async0>, StmtList, Data, Types>
 {
   using Self = SyclLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
   using data_t = camp::decay<Data>;
 
-  static void launch(Data &&data,
+  static void launch(Data&& data,
                      internal::LaunchDims launch_dims,
                      size_t shmem,
                      cl::sycl::queue* qu)
   {
 
-    qu->submit([&](cl::sycl::handler& h) {
- 
-      h.parallel_for(launch_dims.fit_nd_range(qu),
-                     [=] (cl::sycl::nd_item<3> item) {
-
-        SyclKernelLauncher<Data, executor_t>(data, item);
-
-      });
-    });
-
-    if (!async) { qu->wait(); };
-
+    qu->submit(
+        [&](cl::sycl::handler& h)
+        {
+          h.parallel_for(launch_dims.fit_nd_range(qu),
+                         [=](cl::sycl::nd_item<3> item)
+                         { SyclKernelLauncher<Data, executor_t>(data, item); });
+        });
+
+    if (!async)
+    {
+      qu->wait();  // synchronous policy: block until the queue has drained
+    }
   }
 };
 
@@ -195,38 +195,40 @@ struct SyclLaunchHelper<true,sycl_launch<async0>,StmtList,Data,Types>
  */
 template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using StatementType =
       statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>;
 
   template <typename Data>
-  static inline void exec(Data &&data)
+  static inline void exec(Data&& data)
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = sycl_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        sycl_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = SyclLaunchHelper<std::is_trivially_copyable<data_t>::value,
                                       LaunchConfig, stmt_list_t, data_t, Types>;
 
     camp::resources::Sycl res = data.get_resource();
-    ::sycl::queue* q = res.get_queue();;
+    // Queue of the kernel data's resource; handed to launch_t::launch below.
+    ::sycl::queue* q = res.get_queue();
 
     //
     // Compute the requested kernel dimensions
     //
     LaunchDims launch_dims = executor_t::calculateDimensions(data);
-    
+
     int shmem = 0;
 
     //
     // Launch the kernels
     //
     launch_t::launch(std::move(data), launch_dims, shmem, q);
-
   }
-
 };
 
 
diff --git a/include/RAJA/policy/sycl/kernel/Tile.hpp b/include/RAJA/policy/sycl/kernel/Tile.hpp
index 81a57cdecb..ee4c78a273 100644
--- a/include/RAJA/policy/sycl/kernel/Tile.hpp
+++ b/include/RAJA/policy/sycl/kernel/Tile.hpp
@@ -1,12 +1,12 @@
- /*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file for SYCL tiled executors.
- *
- ******************************************************************************
- */
+/*!
+******************************************************************************
+*
+* \file
+*
+* \brief   Header file for SYCL tiled executors.
+*
+******************************************************************************
+*/
 
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
@@ -54,19 +54,22 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
-  using diff_t = segment_diff_type<ArgumentId, Data>;
+  using diff_t           = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active){
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
+  {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     diff_t chunk_size = TPol::chunk_size;
@@ -75,7 +78,8 @@ struct SyclStatementExecutor<
     diff_t len = segment.end() - segment.begin();
 
     // Iterate through tiles
-    for (diff_t i = 0; i < len; i += chunk_size) {
+    for (diff_t i = 0; i < len; i += chunk_size)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, chunk_size);
@@ -89,17 +93,15 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, TPol::chunk_size);
@@ -124,14 +126,13 @@ template <typename Data,
           int BlockDim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_direct<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
-  {
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_group_012_direct<BlockDim>,
+                                             EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -139,20 +140,24 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t i = get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
-    diff_t i = item.get_group(BlockDim) * chunk_size;//get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
+    // diff_t i = get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
+    diff_t i =
+        item.get_group(BlockDim) *
+        chunk_size;  // get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
 
     // check have chunk
-    if (i < len) {
+    if (i < len)
+    {
 
       // Keep copy of original segment, so we can restore it
       segment_t orig_segment = segment;
@@ -169,15 +174,14 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // Compute how many blocks
-    diff_t len = segment_length<ArgumentId>(data);
+    diff_t len        = segment_length<ArgumentId>(data);
     diff_t num_blocks = len / chunk_size;
-    if (num_blocks * chunk_size < len) {
+    if (num_blocks * chunk_size < len)
+    {
       num_blocks++;
     }
 
@@ -189,11 +193,11 @@ struct SyclStatementExecutor<
 
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -217,13 +221,13 @@ template <typename Data,
           int BlockDim,
           typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_loop<BlockDim>,
-                    EnclosedStmts...>, Types>
-  {
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_group_012_loop<BlockDim>,
+                                             EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -231,22 +235,24 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    diff_t len = segment.end() - segment.begin();
-    diff_t i_init = item.get_group(BlockDim) * chunk_size; // TODO
-    diff_t i_stride = item.get_group_range(BlockDim) * chunk_size; // TODO
+    diff_t len      = segment.end() - segment.begin();
+    diff_t i_init   = item.get_group(BlockDim) * chunk_size;        // TODO
+    diff_t i_stride = item.get_group_range(BlockDim) * chunk_size;  // TODO
 
     // Iterate through grid stride of chunks
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, chunk_size);
@@ -260,15 +266,14 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // Compute how many blocks
-    diff_t len = segment_length<ArgumentId>(data);
+    diff_t len        = segment_length<ArgumentId>(data);
     diff_t num_blocks = len / chunk_size;
-    if (num_blocks * chunk_size < len) {
+    if (num_blocks * chunk_size < len)
+    {
       num_blocks++;
     }
 
@@ -276,13 +281,12 @@ struct SyclStatementExecutor<
     set_sycl_dim<BlockDim>(dims.group, num_blocks);
 
 
-
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -296,7 +300,6 @@ struct SyclStatementExecutor<
 };
 
 
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::Tile
  * Assigns the tile segment to segment ArgumentId
@@ -306,33 +309,35 @@ template <typename Data,
           camp::idx_t ArgumentId,
           camp::idx_t chunk_size,
           int ThreadDim,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-  Data,
-  statement::Tile<ArgumentId,
-                  RAJA::tile_fixed<chunk_size>,
-                  sycl_local_012_direct<ThreadDim>,
-                  EnclosedStmts ...>, Types>{
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_local_012_direct<ThreadDim>,
+                                             EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    diff_t i = item.get_local_id(ThreadDim) * chunk_size;
+    diff_t i   = item.get_local_id(ThreadDim) * chunk_size;
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -340,7 +345,7 @@ struct SyclStatementExecutor<
 
     // Assign our new tiled segment
     diff_t slice_size = have_work ? chunk_size : 0;
-    segment = orig_segment.slice(i, slice_size);
+    segment           = orig_segment.slice(i, slice_size);
 
     // execute enclosed statements
     enclosed_stmts_t::exec(data, item, thread_active && have_work);
@@ -350,15 +355,14 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // Compute how many blocks
-    diff_t len = segment_length<ArgumentId>(data);
+    diff_t len         = segment_length<ArgumentId>(data);
     diff_t num_threads = len / chunk_size;
-    if(num_threads * chunk_size < len){
+    if (num_threads * chunk_size < len)
+    {
       num_threads++;
     }
 
@@ -367,20 +371,20 @@ struct SyclStatementExecutor<
     set_sycl_dim<ThreadDim>(dims.min_locals, num_threads);
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
 
 
     LaunchDims enclosed_dims =
-      enclosed_stmts_t::calculateDimensions(private_data);
+        enclosed_stmts_t::calculateDimensions(private_data);
 
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
@@ -394,37 +398,40 @@ template <typename Data,
           camp::idx_t ArgumentId,
           camp::idx_t chunk_size,
           int ThreadDim,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
-struct SyclStatementExecutor<
-  Data,
-  statement::Tile<ArgumentId,
-                  RAJA::tile_fixed<chunk_size>,
-                  sycl_local_012_loop<ThreadDim>,
-                  EnclosedStmts ...>, Types>{
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_local_012_loop<ThreadDim>,
+                                             EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    diff_t len = segment_length<ArgumentId>(data);
-    diff_t i_init = item.get_local_id(ThreadDim) * chunk_size;
+    diff_t len      = segment_length<ArgumentId>(data);
+    diff_t i_init   = item.get_local_id(ThreadDim) * chunk_size;
     diff_t i_stride = item.get_group_range(ThreadDim) * chunk_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -433,7 +440,7 @@ struct SyclStatementExecutor<
 
       // Assign our new tiled segment
       diff_t slice_size = have_work ? chunk_size : 0;
-      segment = orig_segment.slice(i, slice_size);
+      segment           = orig_segment.slice(i, slice_size);
 
       // execute enclosed statements
       enclosed_stmts_t::exec(data, item, thread_active && have_work);
@@ -444,15 +451,14 @@ struct SyclStatementExecutor<
   }
 
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // Compute how many blocks
-    diff_t len = segment_length<ArgumentId>(data);
+    diff_t len         = segment_length<ArgumentId>(data);
     diff_t num_threads = len / chunk_size;
-    if(num_threads * chunk_size < len){
+    if (num_threads * chunk_size < len)
+    {
       num_threads++;
     }
     num_threads = std::max(num_threads, (diff_t)1);
@@ -462,26 +468,24 @@ struct SyclStatementExecutor<
     set_sycl_dim<ThreadDim>(dims.min_locals, 1);
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
 
 
     LaunchDims enclosed_dims =
-      enclosed_stmts_t::calculateDimensions(private_data);
+        enclosed_stmts_t::calculateDimensions(private_data);
 
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
 
-
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/sycl/kernel/TileTCount.hpp b/include/RAJA/policy/sycl/kernel/TileTCount.hpp
index b1d263a263..8f1caf75c0 100644
--- a/include/RAJA/policy/sycl/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/sycl/kernel/TileTCount.hpp
@@ -55,27 +55,31 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types> {
+          Data,
+          statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+          Types>
+{
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>;
+      statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active){
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
+  {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     diff_t chunk_size = TPol::chunk_size;
@@ -84,7 +88,8 @@ struct SyclStatementExecutor<
     diff_t len = segment.end() - segment.begin();
 
     // Iterate through tiles
-    for (diff_t i = 0, t = 0; i < len; i += chunk_size, ++t) {
+    for (diff_t i = 0, t = 0; i < len; i += chunk_size, ++t)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, chunk_size);
@@ -114,48 +119,49 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_direct<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_direct<BlockDim>,
+                          EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_group_012_direct<BlockDim>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<ArgumentId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_direct<BlockDim>,
+                          EnclosedStmts...>,
+          Types>
+{
 
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      sycl_group_012_direct<BlockDim>,
-                      EnclosedStmts...>,
-                      Types>;
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_group_012_direct<BlockDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t t = get_sycl_dim<BlockDim>(blockIdx);
+    // diff_t t = get_sycl_dim<BlockDim>(blockIdx);
     diff_t t = item.get_group(BlockDim);
     diff_t i = t * chunk_size;
 
     // check have a chunk
-    if (i < len) {
+    if (i < len)
+    {
 
       // Keep copy of original segment, so we can restore it
       segment_t orig_segment = segment;
@@ -187,51 +193,52 @@ template <typename Data,
           typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_loop<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_loop<BlockDim>,
+                          EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_group_012_loop<BlockDim>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<ArgumentId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_loop<BlockDim>,
+                          EnclosedStmts...>,
+          Types>
+{
 
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      sycl_group_012_loop<BlockDim>,
-                      EnclosedStmts...>,
-                      Types>;
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_group_012_loop<BlockDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    diff_t len = segment.end() - segment.begin();
-    diff_t t_init = item.get_group(BlockDim);
-    diff_t i_init = t_init * chunk_size;
+    diff_t len      = segment.end() - segment.begin();
+    diff_t t_init   = item.get_group(BlockDim);
+    diff_t i_init   = t_init * chunk_size;
     diff_t t_stride = item.get_group_range(BlockDim);
     diff_t i_stride = t_stride * chunk_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, chunk_size);
@@ -247,7 +254,6 @@ struct SyclStatementExecutor<
 };
 
 
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::TileTCount
  * Assigns the tile segment to segment ArgumentId
@@ -258,49 +264,49 @@ template <typename Data,
           typename ParamId,
           camp::idx_t chunk_size,
           int ThreadDim,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::TileTCount<ArgumentId, ParamId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_local_012_direct<ThreadDim>,
-                        EnclosedStmts ...>,
-                        Types>
-  : public SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_local_012_direct<ThreadDim>,
-                    EnclosedStmts ...>,
-                    Types> {
-
-  using Base = SyclStatementExecutor<
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_local_012_direct<ThreadDim>,
+                          EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
           statement::Tile<ArgumentId,
                           RAJA::tile_fixed<chunk_size>,
                           sycl_local_012_direct<ThreadDim>,
-                          EnclosedStmts ...>,
-                          Types>;
+                          EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_local_012_direct<ThreadDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t t = get_sycl_dim<ThreadDim>(threadIdx);
+    // diff_t t = get_sycl_dim<ThreadDim>(threadIdx);
     diff_t t = item.get_local_id(ThreadDim);
     diff_t i = t * chunk_size;
 
@@ -310,7 +316,7 @@ struct SyclStatementExecutor<
 
     // Assign our new tiled segment
     diff_t slice_size = have_work ? chunk_size : 0;
-    segment = orig_segment.slice(i, slice_size);
+    segment           = orig_segment.slice(i, slice_size);
     data.template assign_param<ParamId>(t);
 
     // execute enclosed statements
@@ -332,57 +338,58 @@ template <typename Data,
           typename ParamId,
           camp::idx_t chunk_size,
           int ThreadDim,
-          typename ... EnclosedStmts,
+          typename... EnclosedStmts,
           typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::TileTCount<ArgumentId, ParamId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_local_012_loop<ThreadDim>,
-                        EnclosedStmts ...>,
-                        Types>
-  : public SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_local_012_loop<ThreadDim>,
-                    EnclosedStmts ...>,
-                    Types> {
-
-  using Base = SyclStatementExecutor<
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_local_012_loop<ThreadDim>,
+                          EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
           statement::Tile<ArgumentId,
                           RAJA::tile_fixed<chunk_size>,
                           sycl_local_012_loop<ThreadDim>,
-                          EnclosedStmts ...>,
-                          Types>;
+                          EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_local_012_loop<ThreadDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     diff_t len = segment_length<ArgumentId>(data);
-//    diff_t t_init = get_sycl_dim<ThreadDim>(threadIdx);
+    //    diff_t t_init = get_sycl_dim<ThreadDim>(threadIdx);
     diff_t t_init = item.get_local_id(ThreadDim);
     diff_t i_init = t_init * chunk_size;
-//    diff_t t_stride = get_sycl_dim<ThreadDim>(blockDim);
+    //    diff_t t_stride = get_sycl_dim<ThreadDim>(blockDim);
     diff_t t_stride = item.get_local_range(ThreadDim);
     diff_t i_stride = t_stride * chunk_size;
 
     // Iterate through grid stride of chunks
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride)
+    {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -391,7 +398,7 @@ struct SyclStatementExecutor<
 
       // Assign our new tiled segment
       diff_t slice_size = have_work ? chunk_size : 0;
-      segment = orig_segment.slice(i, slice_size);
+      segment           = orig_segment.slice(i, slice_size);
       data.template assign_param<ParamId>(t);
 
       // execute enclosed statements
diff --git a/include/RAJA/policy/sycl/kernel/internal.hpp b/include/RAJA/policy/sycl/kernel/internal.hpp
index 56e3a9aa1e..4c68cf58b8 100644
--- a/include/RAJA/policy/sycl/kernel/internal.hpp
+++ b/include/RAJA/policy/sycl/kernel/internal.hpp
@@ -43,7 +43,8 @@ namespace internal
 {
 
 // LaunchDims and Helper functions
-struct LaunchDims {
+struct LaunchDims
+{
   sycl_dim_3_t group;
   sycl_dim_3_t local;
   sycl_dim_3_t global;
@@ -52,22 +53,22 @@ struct LaunchDims {
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  LaunchDims() : group{0,0,0},
-                 local{1,1,1},
-                 global{1,1,1},
-                 min_groups{0,0,0},
-                 min_locals{0,0,0} {}
+  LaunchDims()
+      : group {0, 0, 0},
+        local {1, 1, 1},
+        global {1, 1, 1},
+        min_groups {0, 0, 0},
+        min_locals {0, 0, 0}
+  {}
 
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  LaunchDims(LaunchDims const &c) : group(c.group),
-                                    local(c.local),
-                                    global(c.global)
-  {
-  }
+  LaunchDims(LaunchDims const& c)
+      : group(c.group), local(c.local), global(c.global)
+  {}
 
   RAJA_INLINE
-  LaunchDims max(LaunchDims const &c) const
+  LaunchDims max(LaunchDims const& c) const
   {
     LaunchDims result;
 
@@ -86,89 +87,115 @@ struct LaunchDims {
     return result;
   }
 
-  cl::sycl::nd_range<3> fit_nd_range(::sycl::queue* q) {
+  cl::sycl::nd_range<3> fit_nd_range(::sycl::queue* q)
+  {
 
     sycl_dim_3_t launch_global;
 
-    sycl_dim_3_t launch_local {1,1,1};
-    launch_local.x = std::max(launch_local.x, local.x); 
+    sycl_dim_3_t launch_local {1, 1, 1};
+    launch_local.x = std::max(launch_local.x, local.x);
     launch_local.y = std::max(launch_local.y, local.y);
     launch_local.z = std::max(launch_local.z, local.z);
 
     cl::sycl::device dev = q->get_device();
 
-    auto max_work_group_size = dev.get_info< ::cl::sycl::info::device::max_work_group_size>();
+    auto max_work_group_size =
+        dev.get_info<::cl::sycl::info::device::max_work_group_size>();
 
-    if(launch_local.x > max_work_group_size) {
+    if (launch_local.x > max_work_group_size)
+    {
       launch_local.x = max_work_group_size;
     }
-    if(launch_local.y > max_work_group_size) {
+    if (launch_local.y > max_work_group_size)
+    {
       launch_local.y = max_work_group_size;
     }
-    if(launch_local.z > max_work_group_size) {
+    if (launch_local.z > max_work_group_size)
+    {
       launch_local.z = max_work_group_size;
     }
 
 
     // Make sure the multiple of locals fits
     // Prefer larger z -> y -> x
-    if(launch_local.x * launch_local.y * launch_local.z > max_work_group_size) {
+    if (launch_local.x * launch_local.y * launch_local.z > max_work_group_size)
+    {
       int remaining = 1;
       // local z cannot be > max_wrk from above
-      // if equal then remaining is 1, on handle < 
-      if(max_work_group_size > launch_local.z) {
+      // if equal then remaining is 1, only handle <
+      if (max_work_group_size > launch_local.z)
+      {
         // keep local z
         remaining = max_work_group_size / launch_local.z;
       }
-      if(remaining >= launch_local.y) {
+      if (remaining >= launch_local.y)
+      {
         // keep local y
         remaining = remaining / launch_local.y;
-      } else {
+      }
+      else
+      {
         launch_local.y = remaining;
-        remaining = remaining / launch_local.y;
+        remaining      = remaining / launch_local.y;
       }
-      if(remaining < launch_local.x) {
+      if (remaining < launch_local.x)
+      {
         launch_local.x = remaining;
       }
     }
 
 
     // User gave group policy, use to calculate global space
-    if (group.x != 0 || group.y != 0 || group.z != 0) {
-      sycl_dim_3_t launch_group {1,1,1};
+    if (group.x != 0 || group.y != 0 || group.z != 0)
+    {
+      sycl_dim_3_t launch_group {1, 1, 1};
       launch_group.x = std::max(launch_group.x, group.x);
       launch_group.y = std::max(launch_group.y, group.y);
       launch_group.z = std::max(launch_group.z, group.z);
 
       launch_global.x = launch_local.x * launch_group.x;
-      launch_global.y = launch_local.y * launch_group.y; 
+      launch_global.y = launch_local.y * launch_group.y;
       launch_global.z = launch_local.z * launch_group.z;
-    } else {
-      launch_global.x = launch_local.x * ((global.x + (launch_local.x - 1)) / launch_local.x);
-      launch_global.y = launch_local.y * ((global.y + (launch_local.y - 1)) / launch_local.y);
-      launch_global.z = launch_local.z * ((global.z + (launch_local.z - 1)) / launch_local.z);
+    }
+    else
+    {
+      launch_global.x =
+          launch_local.x * ((global.x + (launch_local.x - 1)) / launch_local.x);
+      launch_global.y =
+          launch_local.y * ((global.y + (launch_local.y - 1)) / launch_local.y);
+      launch_global.z =
+          launch_local.z * ((global.z + (launch_local.z - 1)) / launch_local.z);
     }
 
 
-    if(launch_global.x % launch_local.x != 0) {
-      launch_global.x = ((launch_global.x / launch_local.x) + 1) * launch_local.x; 
+    if (launch_global.x % launch_local.x != 0)
+    {
+      launch_global.x =
+          ((launch_global.x / launch_local.x) + 1) * launch_local.x;
     }
-    if(launch_global.y % launch_local.y != 0) {
-      launch_global.y = ((launch_global.y / launch_local.y) + 1) * launch_local.y; 
+    if (launch_global.y % launch_local.y != 0)
+    {
+      launch_global.y =
+          ((launch_global.y / launch_local.y) + 1) * launch_local.y;
     }
-    if(launch_global.z % launch_local.z != 0) {
-      launch_global.z = ((launch_global.z / launch_local.z) + 1) * launch_local.z; 
+    if (launch_global.z % launch_local.z != 0)
+    {
+      launch_global.z =
+          ((launch_global.z / launch_local.z) + 1) * launch_local.z;
     }
 
-    cl::sycl::range<3> ret_th = {launch_local.x, launch_local.y, launch_local.z};
-    cl::sycl::range<3> ret_gl = {launch_global.x, launch_global.y, launch_global.z};
+    cl::sycl::range<3> ret_th = {launch_local.x, launch_local.y,
+                                 launch_local.z};
+    cl::sycl::range<3> ret_gl = {launch_global.x, launch_global.y,
+                                 launch_global.z};
 
     return cl::sycl::nd_range<3>(ret_gl, ret_th);
   }
 };
 
 template <camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
-struct SyclStatementListExecutorHelper {
+struct SyclStatementListExecutorHelper
+{
 
   using next_helper_t =
       SyclStatementListExecutorHelper<cur_stmt + 1, num_stmts, StmtList>;
@@ -176,7 +203,8 @@ struct SyclStatementListExecutorHelper {
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  inline static RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, item, thread_active);
@@ -186,7 +214,7 @@ struct SyclStatementListExecutorHelper {
   }
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &data)
+  inline static LaunchDims calculateDimensions(Data& data)
   {
     // Compute this statements launch dimensions
     LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
@@ -200,16 +228,17 @@ struct SyclStatementListExecutorHelper {
 };
 
 template <camp::idx_t num_stmts, typename StmtList>
-struct SyclStatementListExecutorHelper<num_stmts, num_stmts, StmtList> {
+struct SyclStatementListExecutorHelper<num_stmts, num_stmts, StmtList>
+{
 
   template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &, cl::sycl::nd_item<3> item, bool)
+  inline static RAJA_DEVICE void exec(Data&, cl::sycl::nd_item<3> item, bool)
   {
     // nop terminator
   }
 
   template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &)
+  inline static LaunchDims calculateDimensions(Data&)
   {
     return LaunchDims();
   }
@@ -223,37 +252,33 @@ struct SyclStatementListExecutor;
 
 
 template <typename Data, typename... Stmts, typename Types>
-struct SyclStatementListExecutor<Data, StatementList<Stmts...>, Types> {
+struct SyclStatementListExecutor<Data, StatementList<Stmts...>, Types>
+{
 
   using enclosed_stmts_t =
       camp::list<SyclStatementExecutor<Data, Stmts, Types>...>;
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void
+  exec(Data& data, cl::sycl::nd_item<3> item, bool thread_active)
   {
     // Execute statements in order with helper class
-    SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, item, thread_active);
+    SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, item, thread_active);
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute this statements launch dimensions
-    return SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
-        calculateDimensions(data);
+    return SyclStatementListExecutorHelper<
+        0, num_stmts, enclosed_stmts_t>::calculateDimensions(data);
   }
 };
 
 template <typename StmtList, typename Data, typename Types>
-using sycl_statement_list_executor_t = SyclStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+using sycl_statement_list_executor_t =
+    SyclStatementListExecutor<Data, StmtList, Types>;
 
 }  // namespace internal
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp
index ad9fecc222..5cef8f570d 100644
--- a/include/RAJA/policy/sycl/launch.hpp
+++ b/include/RAJA/policy/sycl/launch.hpp
@@ -22,23 +22,30 @@
 #include "RAJA/pattern/detail/privatizer.hpp"
 #include "RAJA/policy/sycl/policy.hpp"
 #include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
-//#include "RAJA/policy/sycl/raja_syclerrchk.hpp"
+// #include "RAJA/policy/sycl/raja_syclerrchk.hpp"
 #include "RAJA/util/resource.hpp"
 
 namespace RAJA
 {
 
 template <bool async>
-struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
-
- //If the launch lambda is trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
+{
+
+  // If the launch lambda is trivially copyable
+  template <typename BODY_IN,
+            typename ReduceParams,
+            typename std::enable_if<std::is_trivially_copyable<BODY_IN> {},
+                                    bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
 
     /*Get the queue from concrete resource */
@@ -49,57 +56,72 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     //
 
     const ::sycl::range<3> blockSize(params.threads.value[2],
-				     params.threads.value[1],
-				     params.threads.value[0]);
+                                     params.threads.value[1],
+                                     params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2],
-				    params.threads.value[1] * params.teams.value[1],
-				    params.threads.value[0] * params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        params.threads.value[2] * params.teams.value[2],
+        params.threads.value[1] * params.teams.value[1],
+        params.threads.value[0] * params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( params.threads.value[0]  > zero && params.threads.value[1]  > zero && params.threads.value[2] > zero &&
-         params.teams.value[0] > zero && params.teams.value[1] > zero && params.teams.value[2]> zero ) {
+    if (params.threads.value[0] > zero && params.threads.value[1] > zero &&
+        params.threads.value[2] > zero && params.teams.value[0] > zero &&
+        params.teams.value[1] > zero && params.teams.value[2] > zero)
+    {
 
       RAJA_FT_BEGIN;
 
-      q->submit([&](cl::sycl::handler& h) {
-
-        auto s_vec = ::sycl::local_accessor<char, 1> (params.shared_mem_size, h);
-
-        h.parallel_for
-          (cl::sycl::nd_range<3>(gridSize, blockSize),
-           [=] (cl::sycl::nd_item<3> itm) {
-
-            LaunchContext ctx;
-            ctx.itm = &itm;
-
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
-
-            body_in(ctx);
-
-           });
-
-      });
-
-    if (!async) { q->wait(); }
+      q->submit(
+          [&](cl::sycl::handler& h)
+          {
+            auto s_vec =
+                ::sycl::local_accessor<char, 1>(params.shared_mem_size, h);
+
+            h.parallel_for(
+                cl::sycl::nd_range<3>(gridSize, blockSize),
+                [=](cl::sycl::nd_item<3> itm)
+                {
+                  LaunchContext ctx;
+                  ctx.itm = &itm;
+
+                  // Point to shared memory
+                  ctx.shared_mem_ptr =
+                      s_vec.get_multi_ptr<::sycl::access::decorated::yes>()
+                          .get();
+
+                  body_in(ctx);
+                });
+          });
+
+      if (!async)
+      {
+        q->wait();
+      }
 
       RAJA_FT_END;
-
     }
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
- //If the launch lambda is trivially copyable and we have explcit reduction parameters
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams launch_reducers)
+  // If the launch lambda is trivially copyable and we have explicit reduction
+  // parameters
+  template <typename BODY_IN,
+            typename ReduceParams,
+            typename std::enable_if<std::is_trivially_copyable<BODY_IN> {},
+                                    bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams launch_reducers)
   {
 
     /*Get the queue from concrete resource */
@@ -112,57 +134,66 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     // Compute the number of blocks and threads
     //
     const ::sycl::range<3> blockSize(launch_params.threads.value[2],
-				     launch_params.threads.value[1],
-				     launch_params.threads.value[0]);
+                                     launch_params.threads.value[1],
+                                     launch_params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2],
-				    launch_params.threads.value[1] * launch_params.teams.value[1],
-				    launch_params.threads.value[0] * launch_params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        launch_params.threads.value[2] * launch_params.teams.value[2],
+        launch_params.threads.value[1] * launch_params.teams.value[1],
+        launch_params.threads.value[0] * launch_params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( launch_params.threads.value[0]  > zero && launch_params.threads.value[1]  > zero && launch_params.threads.value[2] > zero &&
-         launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) {
+    if (launch_params.threads.value[0] > zero &&
+        launch_params.threads.value[1] > zero &&
+        launch_params.threads.value[2] > zero &&
+        launch_params.teams.value[0] > zero &&
+        launch_params.teams.value[1] > zero &&
+        launch_params.teams.value[2] > zero)
+    {
 
 
-      auto combiner = []( ReduceParams x, ReduceParams y ) {
-        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+      auto combiner = [](ReduceParams x, ReduceParams y)
+      {
+        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
         return x;
-       };
+      };
 
       RAJA_FT_BEGIN;
 
-      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1,*q);
+      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1, *q);
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
       auto reduction = ::sycl::reduction(res, launch_reducers, combiner);
 
-      q->submit([&](cl::sycl::handler& h) {
-
-       auto s_vec = ::sycl::local_accessor<char, 1> (launch_params.shared_mem_size, h);
-
-        h.parallel_for
-          (cl::sycl::nd_range<3>(gridSize, blockSize),
-           reduction,
-           [=] (cl::sycl::nd_item<3> itm, auto & red) {
+      q->submit(
+           [&](cl::sycl::handler& h)
+           {
+             auto s_vec = ::sycl::local_accessor<char, 1>(
+                 launch_params.shared_mem_size, h);
 
-            LaunchContext ctx;
-            ctx.itm = &itm;
+             h.parallel_for(
+                 cl::sycl::nd_range<3>(gridSize, blockSize), reduction,
+                 [=](cl::sycl::nd_item<3> itm, auto& red)
+                 {
+                   LaunchContext ctx;
+                   ctx.itm = &itm;
 
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+                   // Point to shared memory
+                   ctx.shared_mem_ptr =
+                       s_vec.get_multi_ptr<::sycl::access::decorated::yes>()
+                           .get();
 
-            ReduceParams fp;
-            RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+                   ReduceParams fp;
+                   RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
 
-            RAJA::expt::invoke_body(fp, body_in, ctx);
+                   RAJA::expt::invoke_body(fp, body_in, ctx);
 
-            red.combine(fp);
+                   red.combine(fp);
+                 });
+           })
+          .wait();  // Need to wait for completion to free memory
 
-           });
-
-      }).wait(); // Need to wait for completion to free memory
-
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( launch_reducers, *res );
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(launch_reducers, *res);
       ::sycl::free(res, *q);
 
       RAJA_FT_END;
@@ -170,17 +201,23 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
 
     RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers);
 
-   return resources::EventProxy<resources::Resource>(res);
+    return resources::EventProxy<resources::Resource>(res);
   }
 
-  //If the launch lambda is not trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<!std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  // If the launch lambda is not trivially copyable
+  template <typename BODY_IN,
+            typename ReduceParams,
+            typename std::enable_if<!std::is_trivially_copyable<BODY_IN> {},
+                                    bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
 
     /*Get the queue from concrete resource */
@@ -191,67 +228,79 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     //
 
     const ::sycl::range<3> blockSize(params.threads.value[2],
-				     params.threads.value[1],
-				     params.threads.value[0]);
+                                     params.threads.value[1],
+                                     params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2],
-				    params.threads.value[1] * params.teams.value[1],
-				    params.threads.value[0] * params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        params.threads.value[2] * params.teams.value[2],
+        params.threads.value[1] * params.teams.value[1],
+        params.threads.value[0] * params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( params.threads.value[0]  > zero && params.threads.value[1]  > zero && params.threads.value[2] > zero &&
-         params.teams.value[0] > zero && params.teams.value[1] > zero && params.teams.value[2]> zero ) {
+    if (params.threads.value[0] > zero && params.threads.value[1] > zero &&
+        params.threads.value[2] > zero && params.teams.value[0] > zero &&
+        params.teams.value[1] > zero && params.teams.value[2] > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       //
-      // Kernel body is nontrivially copyable, create space on device and copy to
-      // Workaround until "is_device_copyable" is supported
+      // Kernel body is nontrivially copyable; create space on device and copy
+      // it there. Workaround until "is_device_copyable" is supported.
       //
       using LOOP_BODY = camp::decay<BODY_IN>;
       LOOP_BODY* lbody;
-      lbody = (LOOP_BODY*) cl::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+      lbody = (LOOP_BODY*)cl::sycl::malloc_device(sizeof(LOOP_BODY), *q);
       q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait();
 
-      q->submit([&](cl::sycl::handler& h) {
-
-        auto s_vec = ::sycl::local_accessor<char, 1> (params.shared_mem_size, h);
-
-        h.parallel_for
-          (cl::sycl::nd_range<3>(gridSize, blockSize),
-           [=] (cl::sycl::nd_item<3> itm) {
-
-            LaunchContext ctx;
-            ctx.itm = &itm;
-
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
-
-            (*lbody)(ctx);
-
-           });
-
-      }).wait(); // Need to wait for completion to free memory
+      q->submit(
+           [&](cl::sycl::handler& h)
+           {
+             auto s_vec =
+                 ::sycl::local_accessor<char, 1>(params.shared_mem_size, h);
+
+             h.parallel_for(
+                 cl::sycl::nd_range<3>(gridSize, blockSize),
+                 [=](cl::sycl::nd_item<3> itm)
+                 {
+                   LaunchContext ctx;
+                   ctx.itm = &itm;
+
+                   // Point to shared memory
+                   ctx.shared_mem_ptr =
+                       s_vec.get_multi_ptr<::sycl::access::decorated::yes>()
+                           .get();
+
+                   (*lbody)(ctx);
+                 });
+           })
+          .wait();  // Need to wait for completion to free memory
 
       cl::sycl::free(lbody, *q);
 
       RAJA_FT_END;
-
     }
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
 
-  //If the launch lambda is not trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<!std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-    exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name,
-         BODY_IN &&body_in, ReduceParams launch_reducers)
+  // Not trivially copyable launch lambda with explicit reduction parameters
+  template <typename BODY_IN,
+            typename ReduceParams,
+            typename std::enable_if<!std::is_trivially_copyable<BODY_IN> {},
+                                    bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams launch_reducers)
   {
 
     /*Get the queue from concrete resource */
@@ -264,66 +313,75 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     // Compute the number of blocks and threads
     //
     const ::sycl::range<3> blockSize(launch_params.threads.value[2],
-				     launch_params.threads.value[1],
-				     launch_params.threads.value[0]);
+                                     launch_params.threads.value[1],
+                                     launch_params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2],
-				    launch_params.threads.value[1] * launch_params.teams.value[1],
-				    launch_params.threads.value[0] * launch_params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        launch_params.threads.value[2] * launch_params.teams.value[2],
+        launch_params.threads.value[1] * launch_params.teams.value[1],
+        launch_params.threads.value[0] * launch_params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( launch_params.threads.value[0]  > zero && launch_params.threads.value[1]  > zero && launch_params.threads.value[2] > zero &&
-         launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) {
+    if (launch_params.threads.value[0] > zero &&
+        launch_params.threads.value[1] > zero &&
+        launch_params.threads.value[2] > zero &&
+        launch_params.teams.value[0] > zero &&
+        launch_params.teams.value[1] > zero &&
+        launch_params.teams.value[2] > zero)
+    {
 
 
-      auto combiner = []( ReduceParams x, ReduceParams y ) {
-        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+      auto combiner = [](ReduceParams x, ReduceParams y)
+      {
+        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
         return x;
-       };
+      };
 
       RAJA_FT_BEGIN;
 
       //
-      // Kernel body is nontrivially copyable, create space on device and copy to
-      // Workaround until "is_device_copyable" is supported
+      // Kernel body is nontrivially copyable; create space on device and copy
+      // it there. Workaround until "is_device_copyable" is supported.
       //
       using LOOP_BODY = camp::decay<BODY_IN>;
       LOOP_BODY* lbody;
-      lbody = (LOOP_BODY*) cl::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+      lbody = (LOOP_BODY*)cl::sycl::malloc_device(sizeof(LOOP_BODY), *q);
       q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait();
 
-      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1,*q);
+      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1, *q);
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
       auto reduction = ::sycl::reduction(res, launch_reducers, combiner);
 
-      q->submit([&](cl::sycl::handler& h) {
-
-       auto s_vec = ::sycl::local_accessor<char, 1> (launch_params.shared_mem_size, h);
-
-        h.parallel_for
-          (cl::sycl::nd_range<3>(gridSize, blockSize),
-           reduction,
-           [=] (cl::sycl::nd_item<3> itm, auto & red) {
-
-            LaunchContext ctx;
-            ctx.itm = &itm;
-
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+      q->submit(
+           [&](cl::sycl::handler& h)
+           {
+             auto s_vec = ::sycl::local_accessor<char, 1>(
+                 launch_params.shared_mem_size, h);
 
-            ReduceParams fp;
-            RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+             h.parallel_for(
+                 cl::sycl::nd_range<3>(gridSize, blockSize), reduction,
+                 [=](cl::sycl::nd_item<3> itm, auto& red)
+                 {
+                   LaunchContext ctx;
+                   ctx.itm = &itm;
 
-            RAJA::expt::invoke_body(fp, *lbody, ctx);
+                   // Point to shared memory
+                   ctx.shared_mem_ptr =
+                       s_vec.get_multi_ptr<::sycl::access::decorated::yes>()
+                           .get();
 
-            red.combine(fp);
+                   ReduceParams fp;
+                   RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
 
-           });
+                   RAJA::expt::invoke_body(fp, *lbody, ctx);
 
-      }).wait(); // Need to wait for completion to free memory
+                   red.combine(fp);
+                 });
+           })
+          .wait();  // Need to wait for completion to free memory
 
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( launch_reducers, *res );
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(launch_reducers, *res);
       ::sycl::free(res, *q);
       cl::sycl::free(lbody, *q);
 
@@ -332,15 +390,14 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
 
     RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers);
 
-   return resources::EventProxy<resources::Resource>(res);
+    return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 /*
    SYCL global thread mapping
 */
-template<int ... DIM>
+template <int... DIM>
 struct sycl_global_item;
 
 using sycl_global_item_0 = sycl_global_item<0>;
@@ -348,53 +405,49 @@ using sycl_global_item_1 = sycl_global_item<1>;
 using sycl_global_item_2 = sycl_global_item<2>;
 
 template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_global_item<DIM>, SEGMENT> {
+struct LoopExecute<sycl_global_item<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM) * ctx.itm->get_local_range(DIM) +
-        ctx.itm->get_local_id(DIM);
+      const int tx = ctx.itm->get_group(DIM) * ctx.itm->get_local_range(DIM) +
+                     ctx.itm->get_local_id(DIM);
 
       if (tx < len) body(*(segment.begin() + tx));
     }
   }
 };
 
-using sycl_global_item_01 = sycl_global_item<0,1>;
-using sycl_global_item_02 = sycl_global_item<0,2>;
-using sycl_global_item_10 = sycl_global_item<1,0>;
-using sycl_global_item_12 = sycl_global_item<1,2>;
-using sycl_global_item_20 = sycl_global_item<2,0>;
-using sycl_global_item_21 = sycl_global_item<2,1>;
+using sycl_global_item_01 = sycl_global_item<0, 1>;
+using sycl_global_item_02 = sycl_global_item<0, 2>;
+using sycl_global_item_10 = sycl_global_item<1, 0>;
+using sycl_global_item_12 = sycl_global_item<1, 2>;
+using sycl_global_item_20 = sycl_global_item<2, 0>;
+using sycl_global_item_21 = sycl_global_item<2, 1>;
 
 template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<sycl_global_item<DIM0, DIM1>, SEGMENT> {
+struct LoopExecute<sycl_global_item<DIM0, DIM1>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
-        ctx.itm->get_local_id(DIM0);
+      const int tx = ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
+                     ctx.itm->get_local_id(DIM0);
 
-      const int ty =
-        ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
-        ctx.itm->get_local_id(DIM1);
+      const int ty = ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
+                     ctx.itm->get_local_id(DIM1);
 
 
       if (tx < len0 && ty < len1)
@@ -404,43 +457,39 @@ struct LoopExecute<sycl_global_item<DIM0, DIM1>, SEGMENT> {
 };
 
 
-using sycl_global_item_012 = sycl_global_item<0,1,2>;
-using sycl_global_item_021 = sycl_global_item<0,2,1>;
-using sycl_global_item_102 = sycl_global_item<1,0,2>;
-using sycl_global_item_120 = sycl_global_item<1,2,0>;
-using sycl_global_item_201 = sycl_global_item<2,0,1>;
-using sycl_global_item_210 = sycl_global_item<2,1,0>;
+using sycl_global_item_012 = sycl_global_item<0, 1, 2>;
+using sycl_global_item_021 = sycl_global_item<0, 2, 1>;
+using sycl_global_item_102 = sycl_global_item<1, 0, 2>;
+using sycl_global_item_120 = sycl_global_item<1, 2, 0>;
+using sycl_global_item_201 = sycl_global_item<2, 0, 1>;
+using sycl_global_item_210 = sycl_global_item<2, 1, 0>;
 
 template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<sycl_global_item<DIM0, DIM1, DIM2>, SEGMENT> {
+struct LoopExecute<sycl_global_item<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
-        ctx.itm->get_local_id(DIM0);
+      const int tx = ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
+                     ctx.itm->get_local_id(DIM0);
 
-      const int ty =
-        ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
-        ctx.itm->get_local_id(DIM1);
+      const int ty = ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
+                     ctx.itm->get_local_id(DIM1);
 
-      const int tz =
-        ctx.itm->get_group(DIM2) * ctx.itm->get_local_range(DIM2) +
-        ctx.itm->get_local_id(DIM2);
+      const int tz = ctx.itm->get_group(DIM2) * ctx.itm->get_local_range(DIM2) +
+                     ctx.itm->get_local_id(DIM2);
 
       if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
-             *(segment1.begin() + ty));
+             *(segment2.begin() + tz));
     }
   }
@@ -449,70 +498,86 @@ struct LoopExecute<sycl_global_item<DIM0, DIM1, DIM2>, SEGMENT> {
 /*
 Reshape threads in a block into a 1D iteration space
 */
-template<int ... dim>
-struct sycl_flatten_group_local_direct{};
-
-using sycl_flatten_group_local_01_direct = sycl_flatten_group_local_direct<0,1>;
-using sycl_flatten_group_local_02_direct = sycl_flatten_group_local_direct<0,2>;
-using sycl_flatten_group_local_10_direct = sycl_flatten_group_local_direct<1,0>;
-using sycl_flatten_group_local_12_direct = sycl_flatten_group_local_direct<1,2>;
-using sycl_flatten_group_local_20_direct = sycl_flatten_group_local_direct<2,0>;
-using sycl_flatten_group_local_21_direct = sycl_flatten_group_local_direct<2,1>;
-
-using sycl_flatten_group_local_012_direct = sycl_flatten_group_local_direct<0,1,2>;
-using sycl_flatten_group_local_021_direct = sycl_flatten_group_local_direct<0,2,1>;
-using sycl_flatten_group_local_102_direct = sycl_flatten_group_local_direct<1,0,2>;
-using sycl_flatten_group_local_120_direct = sycl_flatten_group_local_direct<1,2,0>;
-using sycl_flatten_group_local_201_direct = sycl_flatten_group_local_direct<2,0,1>;
-using sycl_flatten_group_local_210_direct = sycl_flatten_group_local_direct<2,1,0>;
-
-template<int ... dim>
-struct sycl_flatten_group_local_loop{};
-
-using sycl_flatten_group_local_01_loop = sycl_flatten_group_local_loop<0,1>;
-using sycl_flatten_group_local_02_loop = sycl_flatten_group_local_loop<0,2>;
-using sycl_flatten_group_local_10_loop = sycl_flatten_group_local_loop<1,0>;
-using sycl_flatten_group_local_12_loop = sycl_flatten_group_local_loop<1,2>;
-using sycl_flatten_group_local_20_loop = sycl_flatten_group_local_loop<2,0>;
-using sycl_flatten_group_local_21_loop = sycl_flatten_group_local_loop<2,1>;
-
-using sycl_flatten_group_local_012_loop = sycl_flatten_group_local_loop<0,1,2>;
-using sycl_flatten_group_local_021_loop = sycl_flatten_group_local_loop<0,2,1>;
-using sycl_flatten_group_local_102_loop = sycl_flatten_group_local_loop<1,0,2>;
-using sycl_flatten_group_local_120_loop = sycl_flatten_group_local_loop<1,2,0>;
-using sycl_flatten_group_local_201_loop = sycl_flatten_group_local_loop<2,0,1>;
-using sycl_flatten_group_local_210_loop = sycl_flatten_group_local_loop<2,1,0>;
-
-template<typename SEGMENT, int DIM0, int DIM1>
+template <int... dim>
+struct sycl_flatten_group_local_direct
+{};
+
+using sycl_flatten_group_local_01_direct =
+    sycl_flatten_group_local_direct<0, 1>;
+using sycl_flatten_group_local_02_direct =
+    sycl_flatten_group_local_direct<0, 2>;
+using sycl_flatten_group_local_10_direct =
+    sycl_flatten_group_local_direct<1, 0>;
+using sycl_flatten_group_local_12_direct =
+    sycl_flatten_group_local_direct<1, 2>;
+using sycl_flatten_group_local_20_direct =
+    sycl_flatten_group_local_direct<2, 0>;
+using sycl_flatten_group_local_21_direct =
+    sycl_flatten_group_local_direct<2, 1>;
+
+using sycl_flatten_group_local_012_direct =
+    sycl_flatten_group_local_direct<0, 1, 2>;
+using sycl_flatten_group_local_021_direct =
+    sycl_flatten_group_local_direct<0, 2, 1>;
+using sycl_flatten_group_local_102_direct =
+    sycl_flatten_group_local_direct<1, 0, 2>;
+using sycl_flatten_group_local_120_direct =
+    sycl_flatten_group_local_direct<1, 2, 0>;
+using sycl_flatten_group_local_201_direct =
+    sycl_flatten_group_local_direct<2, 0, 1>;
+using sycl_flatten_group_local_210_direct =
+    sycl_flatten_group_local_direct<2, 1, 0>;
+
+template <int... dim>
+struct sycl_flatten_group_local_loop
+{};
+
+using sycl_flatten_group_local_01_loop = sycl_flatten_group_local_loop<0, 1>;
+using sycl_flatten_group_local_02_loop = sycl_flatten_group_local_loop<0, 2>;
+using sycl_flatten_group_local_10_loop = sycl_flatten_group_local_loop<1, 0>;
+using sycl_flatten_group_local_12_loop = sycl_flatten_group_local_loop<1, 2>;
+using sycl_flatten_group_local_20_loop = sycl_flatten_group_local_loop<2, 0>;
+using sycl_flatten_group_local_21_loop = sycl_flatten_group_local_loop<2, 1>;
+
+using sycl_flatten_group_local_012_loop =
+    sycl_flatten_group_local_loop<0, 1, 2>;
+using sycl_flatten_group_local_021_loop =
+    sycl_flatten_group_local_loop<0, 2, 1>;
+using sycl_flatten_group_local_102_loop =
+    sycl_flatten_group_local_loop<1, 0, 2>;
+using sycl_flatten_group_local_120_loop =
+    sycl_flatten_group_local_loop<1, 2, 0>;
+using sycl_flatten_group_local_201_loop =
+    sycl_flatten_group_local_loop<2, 0, 1>;
+using sycl_flatten_group_local_210_loop =
+    sycl_flatten_group_local_loop<2, 1, 0>;
+
+template <typename SEGMENT, int DIM0, int DIM1>
 struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1>, SEGMENT>
 {
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
     {
-      const int tx = ctx.itm->get_local_id(DIM0);
-      const int ty = ctx.itm->get_local_id(DIM1);
-      const int bx = ctx.itm->get_local_range(DIM0);
-      const int tid = tx + bx*ty;
+      const int tx  = ctx.itm->get_local_id(DIM0);
+      const int ty  = ctx.itm->get_local_id(DIM1);
+      const int bx  = ctx.itm->get_local_range(DIM0);
+      const int tid = tx + bx * ty;
 
       if (tid < len) body(*(segment.begin() + tid));
     }
   }
 };
 
-template<typename SEGMENT, int DIM0, int DIM1>
+template <typename SEGMENT, int DIM0, int DIM1>
 struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1>, SEGMENT>
 {
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -522,21 +587,19 @@ struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1>, SEGMENT>
     const int bx = ctx.itm->get_local_range(DIM0);
     const int by = ctx.itm->get_local_range(DIM1);
 
-    for(int tid = tx + bx*ty; tid < len; tid += bx*by) {
+    for (int tid = tx + bx * ty; tid < len; tid += bx * by)
+    {
       body(*(segment.begin() + tid));
     }
-
   }
 };
 
-template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1, DIM2>, SEGMENT>
 {
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
     const int len = segment.end() - segment.begin();
     {
@@ -546,21 +609,19 @@ struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1, DIM2>, SEGMENT>
       const int bx = ctx.itm->get_local_range(DIM0);
       const int by = ctx.itm->get_local_range(DIM1);
 
-      const int tid = tx + bx*(ty + by*tz);
+      const int tid = tx + bx * (ty + by * tz);
 
       if (tid < len) body(*(segment.begin() + tid));
     }
   }
 };
 
-template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1, DIM2>, SEGMENT>
 {
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -571,10 +632,10 @@ struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1, DIM2>, SEGMENT>
     const int by = ctx.itm->get_local_range(DIM1);
     const int bz = ctx.itm->get_local_range(DIM2);
 
-    for(int tid = tx + bx*(ty + by*tz); tid < len; tid += bx*by*bz) {
+    for (int tid = tx + bx * (ty + by * tz); tid < len; tid += bx * by * bz)
+    {
       body(*(segment.begin() + tid));
     }
-
   }
 };
 
@@ -582,19 +643,17 @@ struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1, DIM2>, SEGMENT>
   SYCL thread loops with block strides
 */
 template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_local_012_loop<DIM>, SEGMENT> {
+struct LoopExecute<sycl_local_012_loop<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM);
-         tx < len;
+    for (int tx = ctx.itm->get_local_id(DIM); tx < len;
          tx += ctx.itm->get_local_range(DIM))
     {
       body(*(segment.begin() + tx));
@@ -606,13 +665,12 @@ struct LoopExecute<sycl_local_012_loop<DIM>, SEGMENT> {
   SYCL thread direct mappings
 */
 template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_local_012_direct<DIM>, SEGMENT> {
+struct LoopExecute<sycl_local_012_direct<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -627,20 +685,19 @@ struct LoopExecute<sycl_local_012_direct<DIM>, SEGMENT> {
   SYCL block loops with grid strides
 */
 template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_group_012_loop<DIM>, SEGMENT> {
+struct LoopExecute<sycl_group_012_loop<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx = ctx.itm->get_group(DIM);
-         bx < len;
-         bx += ctx.itm->get_group_range(DIM) ) {
+    for (int bx = ctx.itm->get_group(DIM); bx < len;
+         bx += ctx.itm->get_group_range(DIM))
+    {
       body(*(segment.begin() + bx));
     }
   }
@@ -650,13 +707,12 @@ struct LoopExecute<sycl_group_012_loop<DIM>, SEGMENT> {
   SYCL block direct mappings
 */
 template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_group_012_direct<DIM>, SEGMENT> {
+struct LoopExecute<sycl_group_012_direct<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -671,20 +727,18 @@ struct LoopExecute<sycl_group_012_direct<DIM>, SEGMENT> {
   SYCL thread loops with block strides + Return Index
 */
 template <typename SEGMENT, int DIM>
-struct LoopICountExecute<sycl_local_012_loop<DIM>, SEGMENT> {
+struct LoopICountExecute<sycl_local_012_loop<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM);
-         tx < len;
-         tx += ctx.itm->get_local_range(DIM) )
+    for (int tx = ctx.itm->get_local_id(DIM); tx < len;
+         tx += ctx.itm->get_local_range(DIM))
     {
       body(*(segment.begin() + tx), tx);
     }
@@ -695,13 +749,12 @@ struct LoopICountExecute<sycl_local_012_loop<DIM>, SEGMENT> {
   SYCL thread direct mappings
 */
 template <typename SEGMENT, int DIM>
-struct LoopICountExecute<sycl_local_012_direct<DIM>, SEGMENT> {
+struct LoopICountExecute<sycl_local_012_direct<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -716,20 +769,19 @@ struct LoopICountExecute<sycl_local_012_direct<DIM>, SEGMENT> {
   SYCL block loops with grid strides
 */
 template <typename SEGMENT, int DIM>
-struct LoopICountExecute<sycl_group_012_loop<DIM>, SEGMENT> {
+struct LoopICountExecute<sycl_group_012_loop<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx =  ctx.itm->get_group(DIM);
-         bx < len;
-         bx += ctx.itm->get_group_range(DIM) ) {
+    for (int bx = ctx.itm->get_group(DIM); bx < len;
+         bx += ctx.itm->get_group_range(DIM))
+    {
       body(*(segment.begin() + bx), bx);
     }
   }
@@ -739,13 +791,12 @@ struct LoopICountExecute<sycl_group_012_loop<DIM>, SEGMENT> {
   SYCL block direct mappings
 */
 template <typename SEGMENT, int DIM>
-struct LoopICountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
+struct LoopICountExecute<sycl_group_012_direct<DIM>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void
+  exec(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -757,29 +808,29 @@ struct LoopICountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
 };
 
 // perfectly nested sycl direct policies
-using sycl_group_01_nested_direct = sycl_group_012_direct<0,1>;
-using sycl_group_02_nested_direct = sycl_group_012_direct<0,2>;
-using sycl_group_10_nested_direct = sycl_group_012_direct<1,0>;
-using sycl_group_12_nested_direct = sycl_group_012_direct<1,2>;
-using sycl_group_20_nested_direct = sycl_group_012_direct<2,0>;
-using sycl_group_21_nested_direct = sycl_group_012_direct<2,1>;
-
-using sycl_group_012_nested_direct = sycl_group_012_direct<0,1,2>;
-using sycl_group_021_nested_direct = sycl_group_012_direct<0,2,1>;
-using sycl_group_102_nested_direct = sycl_group_012_direct<1,0,2>;
-using sycl_group_120_nested_direct = sycl_group_012_direct<1,2,0>;
-using sycl_group_201_nested_direct = sycl_group_012_direct<2,0,1>;
-using sycl_group_210_nested_direct = sycl_group_012_direct<2,1,0>;
+using sycl_group_01_nested_direct = sycl_group_012_direct<0, 1>;
+using sycl_group_02_nested_direct = sycl_group_012_direct<0, 2>;
+using sycl_group_10_nested_direct = sycl_group_012_direct<1, 0>;
+using sycl_group_12_nested_direct = sycl_group_012_direct<1, 2>;
+using sycl_group_20_nested_direct = sycl_group_012_direct<2, 0>;
+using sycl_group_21_nested_direct = sycl_group_012_direct<2, 1>;
+
+using sycl_group_012_nested_direct = sycl_group_012_direct<0, 1, 2>;
+using sycl_group_021_nested_direct = sycl_group_012_direct<0, 2, 1>;
+using sycl_group_102_nested_direct = sycl_group_012_direct<1, 0, 2>;
+using sycl_group_120_nested_direct = sycl_group_012_direct<1, 2, 0>;
+using sycl_group_201_nested_direct = sycl_group_012_direct<2, 0, 1>;
+using sycl_group_210_nested_direct = sycl_group_012_direct<2, 1, 0>;
 
 template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT> {
+struct LoopExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
@@ -793,15 +844,15 @@ struct LoopExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT> {
 };
 
 template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
+struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -811,8 +862,7 @@ struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
       const int ty = ctx.itm->get_group(DIM1);
       const int tz = ctx.itm->get_group(DIM2);
       if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
              *(segment2.begin() + tz));
     }
   }
@@ -823,37 +873,36 @@ struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
   Return local index
 */
 template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT> {
+struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =  ctx.itm->get_group(DIM0);
-      const int ty =  ctx.itm->get_group(DIM1);
+      const int tx = ctx.itm->get_group(DIM0);
+      const int ty = ctx.itm->get_group(DIM1);
       if (tx < len0 && ty < len1)
-        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
-             tx, ty);
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty), tx, ty);
     }
   }
 };
 
 template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
+struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -863,48 +912,45 @@ struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
       const int ty = ctx.itm->get_group(DIM1);
       const int tz = ctx.itm->get_group(DIM2);
       if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
              *(segment2.begin() + tz), tx, ty, tz);
     }
   }
 };
 
 // perfectly nested sycl loop policies
-using sycl_group_01_nested_loop = sycl_group_012_loop<0,1>;
-using sycl_group_02_nested_loop = sycl_group_012_loop<0,2>;
-using sycl_group_10_nested_loop = sycl_group_012_loop<1,0>;
-using sycl_group_12_nested_loop = sycl_group_012_loop<1,2>;
-using sycl_group_20_nested_loop = sycl_group_012_loop<2,0>;
-using sycl_group_21_nested_loop = sycl_group_012_loop<2,1>;
-
-using sycl_group_012_nested_loop = sycl_group_012_loop<0,1,2>;
-using sycl_group_021_nested_loop = sycl_group_012_loop<0,2,1>;
-using sycl_group_102_nested_loop = sycl_group_012_loop<1,0,2>;
-using sycl_group_120_nested_loop = sycl_group_012_loop<1,2,0>;
-using sycl_group_201_nested_loop = sycl_group_012_loop<2,0,1>;
-using sycl_group_210_nested_loop = sycl_group_012_loop<2,1,0>;
+using sycl_group_01_nested_loop = sycl_group_012_loop<0, 1>;
+using sycl_group_02_nested_loop = sycl_group_012_loop<0, 2>;
+using sycl_group_10_nested_loop = sycl_group_012_loop<1, 0>;
+using sycl_group_12_nested_loop = sycl_group_012_loop<1, 2>;
+using sycl_group_20_nested_loop = sycl_group_012_loop<2, 0>;
+using sycl_group_21_nested_loop = sycl_group_012_loop<2, 1>;
+
+using sycl_group_012_nested_loop = sycl_group_012_loop<0, 1, 2>;
+using sycl_group_021_nested_loop = sycl_group_012_loop<0, 2, 1>;
+using sycl_group_102_nested_loop = sycl_group_012_loop<1, 0, 2>;
+using sycl_group_120_nested_loop = sycl_group_012_loop<1, 2, 0>;
+using sycl_group_201_nested_loop = sycl_group_012_loop<2, 0, 1>;
+using sycl_group_210_nested_loop = sycl_group_012_loop<2, 1, 0>;
 
 template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
+struct LoopExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
 
-      for (int bx = ctx.itm->get_group(DIM0);
-           bx < len0;
+      for (int bx = ctx.itm->get_group(DIM0); bx < len0;
            bx += ctx.itm->get_group_range(DIM0))
       {
-        for (int by = ctx.itm->get_group(DIM1);
-             by < len1;
-             bx += ctx.itm->get_group_range(DIM1))
+        for (int by = ctx.itm->get_group(DIM1); by < len1;
+             by += ctx.itm->get_group_range(DIM1))  // by advances along DIM1
         {
           body(*(segment0.begin() + bx), *(segment1.begin() + by));
@@ -915,37 +961,33 @@ struct LoopExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
 };
 
 template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
+struct LoopExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int bx = ctx.itm->get_group(DIM0);
-         bx < len0;
+    for (int bx = ctx.itm->get_group(DIM0); bx < len0;
          bx += ctx.itm->get_group_range(DIM0))
     {
 
-      for (int by = ctx.itm->get_group(DIM1);
-           by < len1;
+      for (int by = ctx.itm->get_group(DIM1); by < len1;
            by += ctx.itm->get_group_range(DIM1))
       {
 
-        for (int bz = ctx.itm->get_group(DIM2);
-             bz < len2;
+        for (int bz = ctx.itm->get_group(DIM2); bz < len2;
              bz += ctx.itm->get_group_range(DIM2))
         {
 
-          body(*(segment0.begin() + bx),
-               *(segment1.begin() + by),
+          body(*(segment0.begin() + bx), *(segment1.begin() + by),
                *(segment2.begin() + bz));
         }
       }
@@ -957,25 +999,23 @@ struct LoopExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
   perfectly nested sycl loop policies + returns local index
 */
 template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
+struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
 
-      for (int bx = ctx.itm->get_group(DIM0);
-           bx < len0;
+      for (int bx = ctx.itm->get_group(DIM0); bx < len0;
            bx += ctx.itm->get_group_range(DIM0))
       {
-        for (int by = ctx.itm->get_group(DIM0);
-             by < len1;
+        for (int by = ctx.itm->get_group(DIM1); by < len1;
              by += ctx.itm->get_group_range(DIM1))
         {
 
@@ -987,37 +1027,33 @@ struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
 };
 
 template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
+struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
   template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int bx = ctx.itm->get_group(DIM0);
-         bx < len0;
+    for (int bx = ctx.itm->get_group(DIM0); bx < len0;
          bx += ctx.itm->get_group_range(DIM0))
     {
 
-      for (int by = ctx.itm->get_group(DIM0);
-           by < len1;
-           by += ctx.itm->get_group_range(DIM0))
+      for (int by = ctx.itm->get_group(DIM1); by < len1;
+           by += ctx.itm->get_group_range(DIM1))  // by advances along DIM1
       {
 
-        for (int bz =  ctx.itm->get_group(DIM0);
-             bz < len2;
-             bz += ctx.itm->get_group_range(DIM0))
+        for (int bz = ctx.itm->get_group(DIM2); bz < len2;
+             bz += ctx.itm->get_group_range(DIM2))  // bz advances along DIM2
         {
 
-          body(*(segment0.begin() + bx),
-               *(segment1.begin() + by),
+          body(*(segment0.begin() + bx), *(segment1.begin() + by),
                *(segment2.begin() + bz), bx, by, bz);
         }
       }
@@ -1026,20 +1062,19 @@ struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
 };
 
 template <typename SEGMENT, int DIM>
-struct TileExecute<sycl_local_012_loop<DIM>, SEGMENT> {
+struct TileExecute<sycl_local_012_loop<DIM>, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM) * tile_size;
-         tx < len;
+    for (int tx = ctx.itm->get_local_id(DIM) * tile_size; tx < len;
          tx += ctx.itm->get_local_range(DIM) * tile_size)
     {
       body(segment.slice(tx, tile_size));
@@ -1049,20 +1084,20 @@ struct TileExecute<sycl_local_012_loop<DIM>, SEGMENT> {
 
 
 template <typename SEGMENT, int DIM>
-struct TileExecute<sycl_local_012_direct<DIM>, SEGMENT> {
+struct TileExecute<sycl_local_012_direct<DIM>, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_local_id(DIM) * tile_size;
-    if(tx < len)
+    if (tx < len)
     {
       body(segment.slice(tx, tile_size));
     }
@@ -1071,19 +1106,19 @@ struct TileExecute<sycl_local_012_direct<DIM>, SEGMENT> {
 
 
 template <typename SEGMENT, int DIM>
-struct TileExecute<sycl_group_012_loop<DIM>, SEGMENT> {
+struct TileExecute<sycl_group_012_loop<DIM>, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_group(DIM)* tile_size;
+    for (int tx = ctx.itm->get_group(DIM) * tile_size;
 
          tx < len;
 
@@ -1095,110 +1130,110 @@ struct TileExecute<sycl_group_012_loop<DIM>, SEGMENT> {
 };
 
 template <typename SEGMENT, int DIM>
-struct TileExecute<sycl_group_012_direct<DIM>, SEGMENT> {
+struct TileExecute<sycl_group_012_direct<DIM>, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_group(DIM) * tile_size;
-    if(tx < len){
+    if (tx < len)
+    {
       body(segment.slice(tx, tile_size));
     }
   }
 };
 
-//Tile execute + return index
+// Tile execute + return index
 template <typename SEGMENT, int DIM>
-struct TileTCountExecute<sycl_local_012_loop<DIM>, SEGMENT> {
+struct TileTCountExecute<sycl_local_012_loop<DIM>, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM) * tile_size;
-         tx < len;
+    for (int tx = ctx.itm->get_local_id(DIM) * tile_size; tx < len;
          tx += ctx.itm->get_local_range(DIM) * tile_size)
     {
-      body(segment.slice(tx, tile_size), tx/tile_size);
+      body(segment.slice(tx, tile_size), tx / tile_size);
     }
   }
 };
 
 
 template <typename SEGMENT, int DIM>
-struct TileTCountExecute<sycl_local_012_direct<DIM>, SEGMENT> {
+struct TileTCountExecute<sycl_local_012_direct<DIM>, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_local_id(DIM) * tile_size;
-    if(tx < len)
+    if (tx < len)
     {
-      body(segment.slice(tx, tile_size), tx/tile_size);
+      body(segment.slice(tx, tile_size), tx / tile_size);
     }
   }
 };
 
 
 template <typename SEGMENT, int DIM>
-struct TileTCountExecute<sycl_group_012_loop<DIM>, SEGMENT> {
+struct TileTCountExecute<sycl_group_012_loop<DIM>, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx = ctx.itm->get_group(DIM) * tile_size;
-         bx < len;
+    for (int bx = ctx.itm->get_group(DIM) * tile_size; bx < len;
          bx += ctx.itm->get_group_range(DIM) * tile_size)
     {
-      body(segment.slice(bx, tile_size), bx/tile_size);
+      body(segment.slice(bx, tile_size), bx / tile_size);
     }
   }
 };
 
 
 template <typename SEGMENT, int DIM>
-struct TileTCountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
+struct TileTCountExecute<sycl_group_012_direct<DIM>, SEGMENT>
+{
 
   template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int bx = ctx.itm->get_group(DIM) * tile_size;
-    if(bx < len){
-      body(segment.slice(bx, tile_size), bx/tile_size);
+    if (bx < len)
+    {
+      body(segment.slice(bx, tile_size), bx / tile_size);
     }
   }
 };
diff --git a/include/RAJA/policy/sycl/params/kernel_name.hpp b/include/RAJA/policy/sycl/params/kernel_name.hpp
index 1f33be19bb..149d4ca0fd 100644
--- a/include/RAJA/policy/sycl/params/kernel_name.hpp
+++ b/include/RAJA/policy/sycl/params/kernel_name.hpp
@@ -3,39 +3,42 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-#if defined(RAJA_ENABLE_SYCL)  
-  
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  SYCL_EXTERNAL
-  combine(KernelName&, T) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-#endif  
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
-
-
-#endif //  NEW_REDUCE_SYCL_REDUCE_HPP
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+#if defined(RAJA_ENABLE_SYCL)
+
+// Init
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>>
+init(KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template <typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>>
+    SYCL_EXTERNAL combine(KernelName&, T)
+{}
+
+// Resolve
+template <typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>>
+resolve(KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+#endif
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
+
+
+#endif  //  NEW_REDUCE_SYCL_REDUCE_HPP
diff --git a/include/RAJA/policy/sycl/params/reduce.hpp b/include/RAJA/policy/sycl/params/reduce.hpp
index e2fb7e1a5a..6a381d709a 100644
--- a/include/RAJA/policy/sycl/params/reduce.hpp
+++ b/include/RAJA/policy/sycl/params/reduce.hpp
@@ -3,37 +3,43 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_SYCL)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>>
+init(Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>>
+combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>>
+resolve(Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_SYCL_REDUCE_HPP
+#endif  //  NEW_REDUCE_SYCL_REDUCE_HPP
diff --git a/include/RAJA/policy/sycl/policy.hpp b/include/RAJA/policy/sycl/policy.hpp
index 0f92fe27e1..afd7c24b22 100644
--- a/include/RAJA/policy/sycl/policy.hpp
+++ b/include/RAJA/policy/sycl/policy.hpp
@@ -35,7 +35,8 @@
 namespace RAJA
 {
 
-struct uint3 {
+struct uint3
+{
   unsigned long x, y, z;
 };
 
@@ -46,12 +47,14 @@ using sycl_dim_3_t = uint3;
 namespace detail
 {
 template <bool Async>
-struct get_launch {
+struct get_launch
+{
   static constexpr RAJA::Launch value = RAJA::Launch::async;
 };
 
 template <>
-struct get_launch<false> {
+struct get_launch<false>
+{
   static constexpr RAJA::Launch value = RAJA::Launch::sync;
 };
 }  // end namespace detail
@@ -73,27 +76,28 @@ struct sycl_exec : public RAJA::make_policy_pattern_launch_platform_t<
                        RAJA::Policy::sycl,
                        RAJA::Pattern::forall,
                        detail::get_launch<Async>::value,
-                       RAJA::Platform::sycl> {
-};
+                       RAJA::Platform::sycl>
+{};
 
 template <bool Async, int num_threads = 0>
 struct sycl_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::sycl,
-                       RAJA::Pattern::region,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::sycl> {
-};
+                           RAJA::Policy::sycl,
+                           RAJA::Pattern::region,
+                           detail::get_launch<Async>::value,
+                           RAJA::Platform::sycl>
+{};
 
 struct sycl_reduce
-    : make_policy_pattern_t<RAJA::Policy::sycl, RAJA::Pattern::reduce> {
-};
+    : make_policy_pattern_t<RAJA::Policy::sycl, RAJA::Pattern::reduce>
+{};
 
 //
 // Sycl atomic policy for using sycl atomics on the device and
 // the provided Policy on the host
 //
-template<typename host_policy>
-struct sycl_atomic_explicit{};
+template <typename host_policy>
+struct sycl_atomic_explicit
+{};
 
 //
 // Default sycl atomic policy uses sycl atomics on the device and non-atomics
@@ -101,11 +105,13 @@ struct sycl_atomic_explicit{};
 //
 using sycl_atomic = sycl_atomic_explicit<seq_atomic>;
 
-template<typename Mask>
-struct sycl_local_masked_direct {};
+template <typename Mask>
+struct sycl_local_masked_direct
+{};
 
-template<typename Mask>
-struct sycl_local_masked_loop {};
+template <typename Mask>
+struct sycl_local_masked_loop
+{};
 
 }  // namespace sycl
 }  // namespace policy
@@ -120,27 +126,29 @@ using policy::sycl::sycl_local_masked_direct;
 using policy::sycl::sycl_local_masked_loop;
 
 using policy::sycl::sycl_launch_t;
-  
+
 /*!
  * Maps indices to SYCL global id
  * Optional WORK_GROUP_SIZE to
  */
-template<int dim, int WORK_GROUP_SIZE = 1>
-struct sycl_global_012{};
+template <int dim, int WORK_GROUP_SIZE = 1>
+struct sycl_global_012
+{};
 
-template<int WORK_GROUP_SIZE>
+template <int WORK_GROUP_SIZE>
 using sycl_global_0 = sycl_global_012<0, WORK_GROUP_SIZE>;
-template<int WORK_GROUP_SIZE>
+template <int WORK_GROUP_SIZE>
 using sycl_global_1 = sycl_global_012<1, WORK_GROUP_SIZE>;
-template<int WORK_GROUP_SIZE>
+template <int WORK_GROUP_SIZE>
 using sycl_global_2 = sycl_global_012<2, WORK_GROUP_SIZE>;
 
 /*!
  * Maps segment indices to SYCL group ids.
  * Loops to allow for any value
  */
-template<int ... dim>
-struct sycl_group_012_loop{};
+template <int... dim>
+struct sycl_group_012_loop
+{};
 
 using sycl_group_0_loop = sycl_group_012_loop<0>;
 using sycl_group_1_loop = sycl_group_012_loop<1>;
@@ -150,8 +158,9 @@ using sycl_group_2_loop = sycl_group_012_loop<2>;
  * Maps segment indices to SYCL local ids.
  * Loops to allow for any value
  */
-template<int ... dim>
-struct sycl_local_012_loop{};
+template <int... dim>
+struct sycl_local_012_loop
+{};
 
 using sycl_local_0_loop = sycl_local_012_loop<0>;
 using sycl_local_1_loop = sycl_local_012_loop<1>;
@@ -160,8 +169,9 @@ using sycl_local_2_loop = sycl_local_012_loop<2>;
 /*!
  * Maps segment indices to SYCL group ids.
  */
-template<int ... dim>
-struct sycl_group_012_direct{};
+template <int... dim>
+struct sycl_group_012_direct
+{};
 
 using sycl_group_0_direct = sycl_group_012_direct<0>;
 using sycl_group_1_direct = sycl_group_012_direct<1>;
@@ -170,102 +180,87 @@ using sycl_group_2_direct = sycl_group_012_direct<2>;
 /*!
  * Maps segment indices to SYCL local ids.
  */
-template<int ... dim>
-struct sycl_local_012_direct{};
+template <int... dim>
+struct sycl_local_012_direct
+{};
 
 using sycl_local_0_direct = sycl_local_012_direct<0>;
 using sycl_local_1_direct = sycl_local_012_direct<1>;
 using sycl_local_2_direct = sycl_local_012_direct<2>;
 
 
-namespace internal{
+namespace internal
+{
 
-template<int dim>
+template <int dim>
 struct SyclDimHelper;
 
-template<>
-struct SyclDimHelper<0>{
+template <>
+struct SyclDimHelper<0>
+{
 
-  template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  template <typename dim_t>
+  inline static constexpr auto get(dim_t const& d) -> decltype(d.x)
   {
     return d.x;
   }
 
-  template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  template <typename dim_t>
+  inline static void set(dim_t& d, int value)
   {
     d.x = value;
   }
 };
 
-template<>
-struct SyclDimHelper<1>{
+template <>
+struct SyclDimHelper<1>
+{
 
-  template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  template <typename dim_t>
+  inline static constexpr auto get(dim_t const& d) -> decltype(d.x)
   {
     return d.y;
   }
 
-  template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  template <typename dim_t>
+  inline static void set(dim_t& d, int value)
   {
     d.y = value;
   }
 };
 
-template<>
-struct SyclDimHelper<2>{
+template <>
+struct SyclDimHelper<2>
+{
 
-  template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  template <typename dim_t>
+  inline static constexpr auto get(dim_t const& d) -> decltype(d.x)
   {
     return d.z;
   }
 
-  template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  template <typename dim_t>
+  inline static void set(dim_t& d, int value)
   {
     d.z = value;
   }
 };
 
-template<int dim, typename dim_t>
-constexpr
-auto get_sycl_dim(dim_t const &d) ->
-  decltype(d.x)
+template <int dim, typename dim_t>
+constexpr auto get_sycl_dim(dim_t const& d) -> decltype(d.x)
 {
   return SyclDimHelper<dim>::get(d);
 }
 
-template<int dim, typename dim_t>
-void set_sycl_dim(dim_t &d, int value)
+template <int dim, typename dim_t>
+void set_sycl_dim(dim_t& d, int value)
 {
   return SyclDimHelper<dim>::set(d, value);
 }
-} // namespace internal
+}  // namespace internal
 
 }  // namespace RAJA
 
-#endif // RAJA_ENABLE_SYCL
+#endif  // RAJA_ENABLE_SYCL
 
 #endif
diff --git a/include/RAJA/policy/sycl/reduce.hpp b/include/RAJA/policy/sycl/reduce.hpp
index 49d89b3cd2..ac9690f42e 100644
--- a/include/RAJA/policy/sycl/reduce.hpp
+++ b/include/RAJA/policy/sycl/reduce.hpp
@@ -4,7 +4,7 @@
  * \file
  *
  * \brief   Header file for SYCL reduction stucts/classes.
- *          
+ *
  ******************************************************************************
  */
 
@@ -38,15 +38,14 @@ namespace sycl
 {
 
 template <typename T, typename I>
-struct minloc 
+struct minloc
 {
   static constexpr T identity = T(::RAJA::operators::limits<T>::max());
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
-                                               I &loc,
-                                               const T v,
-                                               const I l)
+  RAJA_HOST_DEVICE RAJA_INLINE void
+  operator()(T& val, I& loc, const T v, const I l)
   {
-    if (v < val) {
+    if (v < val)
+    {
       loc = l;
       val = v;
     }
@@ -54,15 +53,14 @@ struct minloc
 };
 
 template <typename T, typename I>
-struct maxloc 
+struct maxloc
 {
   static constexpr T identity = T(::RAJA::operators::limits<T>::min());
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
-                                               I &loc,
-                                               const T v,
-                                               const I l)
+  RAJA_HOST_DEVICE RAJA_INLINE void
+  operator()(T& val, I& loc, const T v, const I l)
   {
-    if (v > val) {
+    if (v > val)
+    {
       loc = l;
       val = v;
     }
@@ -74,18 +72,19 @@ struct maxloc
 static int MaxNumTeams = 1;
 
 //! Information necessary for SYCL offload to be considered
-struct Offload_Info 
+struct Offload_Info
 {
-  int hostID{1};
-  int deviceID{2};
-  bool isMapped{false};
+  int hostID {1};
+  int deviceID {2};
+  bool isMapped {false};
 
   Offload_Info() = default;
 
-  Offload_Info(const Offload_Info &other)
-      : hostID{other.hostID}, deviceID{other.deviceID}, isMapped{other.isMapped}
-  {
-  }
+  Offload_Info(const Offload_Info& other)
+      : hostID {other.hostID},
+        deviceID {other.deviceID},
+        isMapped {other.isMapped}
+  {}
 };
 
 //! Reduction data for SYCL Offload -- stores value, host pointer, and device
@@ -94,8 +93,8 @@ template <typename T>
 struct Reduce_Data
 {
   mutable T value;
-  T *device;
-  T *host;
+  T* device;
+  T* host;
 
   //! disallow default constructor
   Reduce_Data() = delete;
@@ -104,20 +103,24 @@ struct Reduce_Data
    *
    *  allocates data on the host and device and initializes values to default
    */
-  Reduce_Data(T initValue, T identityValue, Offload_Info &info)
+  Reduce_Data(T initValue, T identityValue, Offload_Info& info)
       : value(initValue)
   {
     cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
 
 
-    device = reinterpret_cast<T *>(cl::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q)));
-    host = reinterpret_cast<T *>(cl::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q)));
+    device = reinterpret_cast<T*>(
+        cl::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q)));
+    host = reinterpret_cast<T*>(
+        cl::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q)));
 
-    if (!host) {
+    if (!host)
+    {
       printf("Unable to allocate space on host\n");
       exit(1);
     }
-    if (!device) {
+    if (!device)
+    {
       printf("Unable to allocate space on device\n");
       exit(1);
     }
@@ -125,62 +128,63 @@ struct Reduce_Data
     hostToDevice(info);
   }
 
-  void reset(T initValue)
-  {
-    value = initValue;
-  }
+  void reset(T initValue) { value = initValue; }
 
   //! default copy constructor for POD
-  Reduce_Data(const Reduce_Data &) = default;
+  Reduce_Data(const Reduce_Data&) = default;
 
   //! transfers from the host to the device -- exit() is called upon failure
-  RAJA_INLINE void hostToDevice(Offload_Info &info)
+  RAJA_INLINE void hostToDevice(Offload_Info& info)
   {
     cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
 
-    if(!q) {
+    if (!q)
+    {
       camp::resources::Resource res = camp::resources::Sycl();
       q = res.get<camp::resources::Sycl>().get_queue();
     }
 
     // precondition: host and device are valid pointers
-    auto e = q->memcpy(reinterpret_cast<void *>(device),
-                       reinterpret_cast<void *>(host),
-                       sycl::MaxNumTeams * sizeof(T));
+    auto e =
+        q->memcpy(reinterpret_cast<void*>(device),
+                  reinterpret_cast<void*>(host), sycl::MaxNumTeams * sizeof(T));
 
     e.wait();
   }
 
   //! transfers from the device to the host -- exit() is called upon failure
-  RAJA_INLINE void deviceToHost(Offload_Info &info)
+  RAJA_INLINE void deviceToHost(Offload_Info& info)
   {
     cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
 
-    if(!q) {
+    if (!q)
+    {
       camp::resources::Resource res = camp::resources::Sycl();
       q = res.get<camp::resources::Sycl>().get_queue();
-    } 
+    }
 
     // precondition: host and device are valid pointers
-    auto e = q->memcpy(reinterpret_cast<void *>(host),
-                       reinterpret_cast<void *>(device),
+    auto e = q->memcpy(reinterpret_cast<void*>(host),
+                       reinterpret_cast<void*>(device),
                        sycl::MaxNumTeams * sizeof(T));
- 
+
     e.wait();
   }
 
   //! frees all data from the offload information passed
-  RAJA_INLINE void cleanup(Offload_Info &info)
+  RAJA_INLINE void cleanup(Offload_Info& info)
   {
     cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
 
-    if (device) {
-      cl::sycl::free(reinterpret_cast<void *>(device), *q);
+    if (device)
+    {
+      cl::sycl::free(reinterpret_cast<void*>(device), *q);
       device = nullptr;
     }
-    if (host) {
-      cl::sycl::free(reinterpret_cast<void *>(host), *q);
-      //delete[] host;
+    if (host)
+    {
+      cl::sycl::free(reinterpret_cast<void*>(host), *q);
+      // delete[] host;
       host = nullptr;
     }
   }
@@ -191,47 +195,46 @@ struct Reduce_Data
 //! SYCL Target Reduction entity -- generalize on # of teams, reduction, and
 //! type
 template <typename Reducer, typename T>
-struct TargetReduce 
+struct TargetReduce
 {
-  TargetReduce() = delete;
-  TargetReduce(const TargetReduce &) = default;
+  TargetReduce()                    = delete;
+  TargetReduce(const TargetReduce&) = default;
 
   explicit TargetReduce(T init_val)
       : info(),
         val(Reducer::identity(), Reducer::identity(), info),
         initVal(init_val),
         finalVal(Reducer::identity())
-  {
-  }
+  {}
 
   void reset(T init_val_, T identity_ = Reducer::identity())
   {
     val.cleanup(info);
-    val = sycl::Reduce_Data<T>(identity_, identity_, info);
+    val           = sycl::Reduce_Data<T>(identity_, identity_, info);
     info.isMapped = false;
-    initVal = init_val_;
-    finalVal = identity_;
+    initVal       = init_val_;
+    finalVal      = identity_;
   }
 
   //! apply reduction on device upon destruction
-  ~TargetReduce()
-  {
-  }
+  ~TargetReduce() {}
 
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    if (!info.isMapped) {
+    if (!info.isMapped)
+    {
       val.deviceToHost(info);
-      for (int i =0; i < sycl::MaxNumTeams; ++i) {
-        Reducer{}(val.value, val.host[i]);
+      for (int i = 0; i < sycl::MaxNumTeams; ++i)
+      {
+        Reducer {}(val.value, val.host[i]);
       }
-//      val.cleanup(info);
+      //      val.cleanup(info);
       info.isMapped = true;
     }
     finalVal = Reducer::identity();
-    Reducer{}(finalVal, initVal);
-    Reducer{}(finalVal, val.value);
+    Reducer {}(finalVal, initVal);
+    Reducer {}(finalVal, val.value);
     T returnVal = finalVal;
     reset(finalVal);
     return returnVal;
@@ -240,29 +243,37 @@ struct TargetReduce
   T get() { return operator T(); }
 
   //! apply reduction
-  TargetReduce &reduce(T rhsVal)
+  TargetReduce& reduce(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(val.device[i]);
-    Reducer{}(atm, rhsVal);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            val.device[i]);
+    Reducer {}(atm, rhsVal);
     return *this;
 #else
-    Reducer{}(val.value, rhsVal);
+    Reducer {}(val.value, rhsVal);
     return *this;
 #endif
   }
 
   //! apply reduction (const version) -- still reduces internal values
-  const TargetReduce &reduce(T rhsVal) const
+  const TargetReduce& reduce(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(val.device[i]);
-    Reducer{}(atm, rhsVal);  
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            val.device[i]);
+    Reducer {}(atm, rhsVal);
     return *this;
 #else
-    Reducer{}(val.value, rhsVal);
+    Reducer {}(val.value, rhsVal);
     return *this;
 #endif
   }
@@ -281,13 +292,16 @@ struct TargetReduce
 //! SYCL Target Reduction Location entity -- generalize on # of teams,
 //! reduction, and type
 template <typename Reducer, typename T, typename IndexType>
-struct TargetReduceLoc 
+struct TargetReduceLoc
 {
-  TargetReduceLoc() = delete;
-  TargetReduceLoc(const TargetReduceLoc &) = default;
-  explicit TargetReduceLoc(T init_val, IndexType init_loc,
-                           T identity_val_ = Reducer::identity,
-                           IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+  TargetReduceLoc()                       = delete;
+  TargetReduceLoc(const TargetReduceLoc&) = default;
+  explicit TargetReduceLoc(
+      T init_val,
+      IndexType init_loc,
+      T identity_val_ = Reducer::identity,
+      IndexType identity_loc_ =
+          RAJA::reduce::detail::DefaultLoc<IndexType>().value())
       : info(),
         val(identity_val_, identity_val_, info),
         loc(identity_loc_, identity_loc_, info),
@@ -295,45 +309,46 @@ struct TargetReduceLoc
         finalVal(identity_val_),
         initLoc(init_loc),
         finalLoc(identity_loc_)
-  {
-  }
+  {}
 
-  void reset(T init_val_, IndexType init_loc_,
+  void reset(T init_val_,
+             IndexType init_loc_,
              T identity_val_ = Reducer::identity,
-             IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_loc_ =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
     val.cleanup(info);
     val = sycl::Reduce_Data<T>(identity_val_, identity_val_, info);
     loc.cleanup(info);
     loc = sycl::Reduce_Data<IndexType>(identity_loc_, identity_loc_, info);
     info.isMapped = false;
-    initVal = init_val_;
-    finalVal = identity_val_;
-    initLoc = init_loc_;
-    finalLoc = identity_loc_;
+    initVal       = init_val_;
+    finalVal      = identity_val_;
+    initLoc       = init_loc_;
+    finalLoc      = identity_loc_;
   }
 
   //! apply reduction on device upon destruction
-  ~TargetReduceLoc()
-  {
-  }
+  ~TargetReduceLoc() {}
 
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    if (!info.isMapped) {
+    if (!info.isMapped)
+    {
       val.deviceToHost(info);
       loc.deviceToHost(info);
-      
-      for (int i = 0; i < sycl::MaxNumTeams; ++i) {
-        Reducer{}(val.value, loc.value, val.host[i], loc.host[i]);
+
+      for (int i = 0; i < sycl::MaxNumTeams; ++i)
+      {
+        Reducer {}(val.value, loc.value, val.host[i], loc.host[i]);
       }
       info.isMapped = true;
     }
     finalVal = Reducer::identity;
     finalLoc = IndexType(RAJA::reduce::detail::DefaultLoc<IndexType>().value());
-    Reducer{}(finalVal, finalLoc, initVal, initLoc);
-    Reducer{}(finalVal, finalLoc, val.value, loc.value);
+    Reducer {}(finalVal, finalLoc, initVal, initLoc);
+    Reducer {}(finalVal, finalLoc, val.value, loc.value);
     returnVal = finalVal;
     returnLoc = finalLoc;
     reset(finalVal, finalLoc);
@@ -353,24 +368,26 @@ struct TargetReduceLoc
   }
 
   //! apply reduction
-  TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc)
+  TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    cl::sycl::atomic_fence(cl::sycl::memory_order_acquire, cl::sycl::memory_scope::device);
-    Reducer{}(val.device[i], loc.device[i], rhsVal, rhsLoc);
-    cl::sycl::atomic_fence(cl::sycl::memory_order_release, cl::sycl::memory_scope::device);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    cl::sycl::atomic_fence(cl::sycl::memory_order_acquire,
+                           cl::sycl::memory_scope::device);
+    Reducer {}(val.device[i], loc.device[i], rhsVal, rhsLoc);
+    cl::sycl::atomic_fence(cl::sycl::memory_order_release,
+                           cl::sycl::memory_scope::device);
     return *this;
 #else
-    Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
+    Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
     return *this;
 #endif
   }
 
   //! apply reduction (const version) -- still reduces internal values
-  const TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc) const
+  const TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc) const
   {
-    Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
+    Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
     return *this;
   }
 
@@ -382,7 +399,7 @@ struct TargetReduceLoc
   //! storage for offload information
   sycl::Offload_Info info;
   //! storage for reduction data for value
-//  sycl::Reduce_Data<T> val;
+  //  sycl::Reduce_Data<T> val;
   //! storage for redcution data for location
   T initVal;
   T finalVal;
@@ -395,28 +412,30 @@ struct TargetReduceLoc
 
 //! specialization of ReduceSum for omp_target_reduce
 template <typename T>
-class ReduceSum<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::sum<T>, T>
+class ReduceSum<sycl_reduce, T> : public TargetReduce<RAJA::reduce::sum<T>, T>
 {
 public:
-
-  using self = ReduceSum<sycl_reduce, T>;
+  using self   = ReduceSum<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::sum<T>, T>;
   using parent::parent;
 
   //! enable operator+= for ReduceSum -- alias for reduce()
-  self &operator+=(T rhsVal)
+  self& operator+=(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable operator+= for ReduceSum -- alias for reduce()
-  const self &operator+=(T rhsVal) const
+  const self& operator+=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_add(rhsVal);
     return *this;
 #else
@@ -432,17 +451,20 @@ class ReduceBitOr<sycl_reduce, T>
     : public TargetReduce<RAJA::reduce::or_bit<T>, T>
 {
 public:
-
-  using self = ReduceBitOr<sycl_reduce, T>;
+  using self   = ReduceBitOr<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::or_bit<T>, T>;
   using parent::parent;
 
   //! enable operator|= for ReduceBitOr -- alias for reduce()
-  self &operator|=(T rhsVal)
+  self& operator|=(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm |= rhsVal;
     return *this;
 #else
@@ -452,11 +474,15 @@ class ReduceBitOr<sycl_reduce, T>
   }
 
   //! enable operator|= for ReduceBitOr -- alias for reduce()
-  const self &operator|=(T rhsVal) const
+  const self& operator|=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm |= rhsVal;
     return *this;
 #else
@@ -472,17 +498,20 @@ class ReduceBitAnd<sycl_reduce, T>
     : public TargetReduce<RAJA::reduce::and_bit<T>, T>
 {
 public:
-
-  using self = ReduceBitAnd<sycl_reduce, T>;
+  using self   = ReduceBitAnd<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::and_bit<T>, T>;
   using parent::parent;
 
   //! enable operator&= for ReduceBitAnd -- alias for reduce()
-  self &operator&=(T rhsVal)
+  self& operator&=(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm &= rhsVal;
     return *this;
 #else
@@ -492,11 +521,15 @@ class ReduceBitAnd<sycl_reduce, T>
   }
 
   //! enable operator&= for ReduceBitAnd -- alias for reduce()
-  const self &operator&=(T rhsVal) const
+  const self& operator&=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm &= rhsVal;
     return *this;
 #else
@@ -509,21 +542,23 @@ class ReduceBitAnd<sycl_reduce, T>
 
 //! specialization of ReduceMin for omp_target_reduce
 template <typename T>
-class ReduceMin<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::min<T>, T>
+class ReduceMin<sycl_reduce, T> : public TargetReduce<RAJA::reduce::min<T>, T>
 {
 public:
-
-  using self = ReduceMin<sycl_reduce, T>;
+  using self   = ReduceMin<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::min<T>, T>;
   using parent::parent;
 
   //! enable min() for ReduceMin -- alias for reduce()
-  self &min(T rhsVal)
+  self& min(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_min(rhsVal);
     return *this;
 #else
@@ -533,11 +568,15 @@ class ReduceMin<sycl_reduce, T>
   }
 
   //! enable min() for ReduceMin -- alias for reduce()
-  const self &min(T rhsVal) const
+  const self& min(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_min(rhsVal);
     return *this;
 #else
@@ -550,21 +589,23 @@ class ReduceMin<sycl_reduce, T>
 
 //! specialization of ReduceMax for omp_target_reduce
 template <typename T>
-class ReduceMax<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::max<T>, T>
+class ReduceMax<sycl_reduce, T> : public TargetReduce<RAJA::reduce::max<T>, T>
 {
 public:
-
-  using self = ReduceMax<sycl_reduce, T>;
+  using self   = ReduceMax<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::max<T>, T>;
   using parent::parent;
 
   //! enable max() for ReduceMax -- alias for reduce()
-  self &max(T rhsVal)
+  self& max(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_max(rhsVal);
     return *this;
 #else
@@ -574,11 +615,15 @@ class ReduceMax<sycl_reduce, T>
   }
 
   //! enable max() for ReduceMax -- alias for reduce()
-  const self &max(T rhsVal) const
+  const self& max(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_max(rhsVal);
     return *this;
 #else
diff --git a/include/RAJA/policy/tensor/arch.hpp b/include/RAJA/policy/tensor/arch.hpp
index 771adea64f..50de38b80a 100644
--- a/include/RAJA/policy/tensor/arch.hpp
+++ b/include/RAJA/policy/tensor/arch.hpp
@@ -23,26 +23,27 @@
 namespace RAJA
 {
 
-namespace internal {
+namespace internal
+{
 
 namespace expt
 {
 
 
-  /*!
-   * Provides architectural details for a given architecture and data type.
-   */
-  template<typename REGISTER_POLICY, typename T>
-  struct RegisterTraits;
-  /*
-   * using element_type = T;
-   * using register_policy = REGISTER_POLICY;
-   * static constexpr camp::idx s_num_bits = X;
-   * static constexpr camp::idx s_num_elem = Y;
-   *
-   */
-} //namespace expt
-} //namespace internal
+/*!
+ * Provides architectural details for a given architecture and data type.
+ */
+template <typename REGISTER_POLICY, typename T>
+struct RegisterTraits;
+/*
+ * using element_type = T;
+ * using register_policy = REGISTER_POLICY;
+ * static constexpr camp::idx s_num_bits = X;
+ * static constexpr camp::idx s_num_elem = Y;
+ *
+ */
+}  // namespace expt
+}  // namespace internal
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -54,7 +55,8 @@ namespace expt
 {
 
 #ifdef __AVX512F__
-struct avx512_register {};
+struct avx512_register
+{};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx512_register
@@ -63,7 +65,8 @@ struct avx512_register {};
 
 
 #ifdef __AVX2__
-struct avx2_register {};
+struct avx2_register
+{};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx2_register
@@ -72,7 +75,8 @@ struct avx2_register {};
 
 
 #ifdef __AVX__
-struct avx_register {};
+struct avx_register
+{};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx_register
@@ -85,7 +89,8 @@ struct avx_register {};
 /*!
  * A CUDA warp distributed vector register
  */
-struct cuda_warp_register {};
+struct cuda_warp_register
+{};
 
 #endif
 
@@ -96,12 +101,14 @@ struct cuda_warp_register {};
  * A HIP wavefront distributed vector register
  * On AMD GPUs this is rally just a vector register
  */
-struct hip_wave_register {};
+struct hip_wave_register
+{};
 
 #endif
 
 // The scalar register is always supported (doesn't require any SIMD/SIMT)
-struct scalar_register {};
+struct scalar_register
+{};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::scalar_register
@@ -109,13 +116,12 @@ struct scalar_register {};
 #endif
 
 
-  // This sets the default SIMD register that will be used
-  using default_register = RAJA_TENSOR_REGISTER_TYPE;
-
+// This sets the default SIMD register that will be used
+using default_register = RAJA_TENSOR_REGISTER_TYPE;
 
-} // namespace expt
-} // namespace RAJA
 
+}  // namespace expt
+}  // namespace RAJA
 
 
 //
diff --git a/include/RAJA/policy/tensor/arch/avx.hpp b/include/RAJA/policy/tensor/arch/avx.hpp
index ed25f1f3e3..c0df27fac9 100644
--- a/include/RAJA/policy/tensor/arch/avx.hpp
+++ b/include/RAJA/policy/tensor/arch/avx.hpp
@@ -17,11 +17,11 @@
 
 #ifdef __AVX__
 
-#include<RAJA/policy/tensor/arch/avx/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_float.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_double.hpp>
+#include <RAJA/policy/tensor/arch/avx/traits.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_float.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_double.hpp>
 
 
-#endif // __AVX__
+#endif  // __AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_double.hpp b/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
index 8a23d66e57..2978673727 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
@@ -34,444 +34,462 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<double, avx_register> :
-    public internal::expt::RegisterBase<Register<double, avx_register>>
+template <>
+class Register<double, avx_register>
+    : public internal::expt::RegisterBase<Register<double, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type       = Register<double, avx_register>;
+  using element_type    = double;
+  using register_type   = __m256d;
+
+  using int_vector_type = Register<int64_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_pd()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : base_type(), m_value(_mm256_set_pd(x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_pd(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_pd(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    m_value = _mm256_setzero_pd();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    };
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<double, avx_register>;
-      using element_type = double;
-      using register_type = __m256d;
-
-      using int_vector_type = Register<int64_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-                     base_type(), m_value(_mm256_set_pd(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {}
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_pd(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 4;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_pd();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        };
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_pd(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply a masked divide, so do it manually
-        return self_type(_mm256_set_pd(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        auto sh1 = _mm256_permute_pd(m_value, 0x5);
-        auto red1 = _mm256_add_pd(m_value, sh1);
-        return red1[0]+red1[2];
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the maximum value of each lane
-        // B = { max{v[0], v[1]},
-        //       max{v[0], v[1]},
-        //       max{v[2], v[3]},
-        //       max{v[2], v[3]} }
-        register_type b = _mm256_max_pd(m_value, a);
-
-        // now take the maximum of a lower and upper halves
-        return RAJA::max<element_type>(b[0], b[2]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[3]},
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper halves
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[2]},   <-- just v[2]
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper lane
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::min();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_pd(m_value, a.m_value));
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the minimum value of each lane
-        // B = { min{v[0], v[1]},
-        //       min{v[0], v[1]},
-        //       min{v[2], v[3]},
-        //       min{v[2], v[3]} }
-        register_type b = _mm256_min_pd(m_value, a);
-
-        // now take the minimum of a lower and upper halves
-        return RAJA::min<element_type>(b[0], b[2]);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[3]},
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper halves
-          return RAJA::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[2]},   <-- just v[2]
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper lane
-          return RAJA::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::min<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::max();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    m_value = _mm256_set1_pd(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm256_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm256_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide, so do it manually
+    return self_type(_mm256_set_pd(
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    auto sh1  = _mm256_permute_pd(m_value, 0x5);
+    auto red1 = _mm256_add_pd(m_value, sh1);
+    return red1[0] + red1[2];
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the maximum value of each lane
+    // B = { max{v[0], v[1]},
+    //       max{v[0], v[1]},
+    //       max{v[2], v[3]},
+    //       max{v[2], v[3]} }
+    register_type b = _mm256_max_pd(m_value, a);
+
+    // now take the maximum of a lower and upper halves
+    return RAJA::max<element_type>(b[0], b[2]);
+  }
+
+  /*!
+   * @brief Returns the largest element of the first N lanes
+   * @return Largest of lanes 0..N-1; limits<double>::min() if N is not 1..4
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    if (N == 4)
+    {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[3]},
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper halves
+      return RAJA::max<element_type>(b[0], b[2]);
+    }
+    else if (N == 3)
+    {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[1], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[1], v[1]},   <-- just v[1]
+      //       max{v[2], v[2]},   <-- just v[2]
+      //       max{v[3], v[2]} }  <-- not read below
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper lane
+      return RAJA::max<element_type>(b[0], b[2]);
+    }
+    else if (N == 2)
+    {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    }
+    else if (N == 1)
+    {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::min();
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_pd(m_value, a.m_value));
+  }
+
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the minimum value of each lane
+    // B = { min{v[0], v[1]},
+    //       min{v[0], v[1]},
+    //       min{v[2], v[3]},
+    //       min{v[2], v[3]} }
+    register_type b = _mm256_min_pd(m_value, a);
+
+    // now take the minimum of a lower and upper halves
+    return RAJA::min<element_type>(b[0], b[2]);
+  }
+
+
+  /*!
+   * @brief Returns the smallest element of the first N lanes
+   * @return Smallest of lanes 0..N-1; limits<double>::max() if N is not 1..4
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N == 4)
+    {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[3]},
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper halves
+      return RAJA::min<element_type>(b[0], b[2]);
+    }
+    else if (N == 3)
+    {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[1], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[1], v[1]},   <-- just v[1]
+      //       min{v[2], v[2]},   <-- just v[2]
+      //       min{v[3], v[2]} }  <-- not read below
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper lane
+      return RAJA::min<element_type>(b[0], b[2]);
+    }
+    else if (N == 2)
+    {
+      return RAJA::min<element_type>(m_value[0], m_value[1]);
+    }
+    else if (N == 1)
+    {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::max();
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX__
+#endif  //__AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_float.hpp b/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
index 1e6563742a..6330f95525 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
@@ -34,457 +34,481 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<float, avx_register> :
-    public internal::expt::RegisterBase<Register<float, avx_register>>
+template <>
+class Register<float, avx_register>
+    : public internal::expt::RegisterBase<Register<float, avx_register>>
+{
+public:
+  using base_type = internal::expt::RegisterBase<Register<float, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type       = Register<float, avx_register>;
+  using element_type    = float;
+  using register_type   = __m256;
+
+  using int_vector_type = Register<int32_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask: lane i is all ones (-1) when i < N, zero otherwise
+    return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_ps()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_ps(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_ps(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *        Lanes at index N and above are set to zero.
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    m_value = _mm256_setzero_ps();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *        (masked store; lanes at index N and above are not written)
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter entire register to strided memory locations
+   *        (element i is written to ptr[i * stride])
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store the first N elements to strided memory locations
+   *        (element i is written to ptr[i * stride] for i < N)
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_ps(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm256_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm256_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide
+    return self_type(_mm256_set_ps(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<float, avx_register>;
-      using element_type = float;
-      using register_type = __m256;
-
-      using int_vector_type = Register<int32_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_ps(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_ps(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_ps(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 8;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_ps();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX2 does not supply a masked divide
-        return self_type(_mm256_set_ps(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_add_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_add_ps(red1, sh2);
-
-        return red2[0] + red2[4];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // swap odd-even pairs and combine
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // swap odd-even quads and combine
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        // combine quads
-        return RAJA::max<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element of first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::min();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-
-        if(N == 7){
-          // blend out the 8th lane of the permute
-          sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
-        }
-
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // Some more simple shortcuts
-        if(N == 3){
-          return RAJA::max<element_type>(red1[0], m_value[2]);
-        }
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        if(N == 4){
-          return red2[0];
-        }
-        if(N == 5){
-          return RAJA::max<element_type>(red2[0], m_value[4]);
-        }
-        if(N == 6){
-          return RAJA::max<element_type>(red2[0], red1[4]);
-        }
-
-        // 7 or 8 lanes
-        return RAJA::max<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // swap odd-even pairs and combine
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // swap odd-even quads and combine
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        // combine quads
-        return RAJA::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::max();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return RAJA::min<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-
-        if(N == 7){
-          // blend out the 8th lane of the permute
-          sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
-        }
-
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // Some more simple shortcuts
-        if(N == 3){
-          return RAJA::min<element_type>(red1[0], m_value[2]);
-        }
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        if(N == 4){
-          return red2[0];
-        }
-        if(N == 5){
-          return RAJA::min<element_type>(red2[0], m_value[4]);
-        }
-        if(N == 6){
-          return RAJA::min<element_type>(red2[0], red1[4]);
-        }
-
-        // 7 or 8 lanes
-        return RAJA::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    // swap odd-even pairs and add
+    auto sh1  = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_add_ps(m_value, sh1);
+
+    // swap odd-even quads and add
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_add_ps(red1, sh2);
+
+    return red2[0] + red2[4];
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // swap odd-even pairs and combine
+    auto sh1  = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // swap odd-even quads and combine
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    // combine quads
+    return RAJA::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the largest element of first N lanes
+   * @return Largest of lanes 0..N-1; limits<float>::min() if N is not 1..8
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<float>::min();
+    }
+    if (N == 1)
+    {
+      return m_value[0];
+    }
+    if (N == 2)
+    {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and combine
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+
+    if (N == 7)
+    {
+      // lane 6 of the permute holds v[7]; blend v[6] back in to drop lane 8
+      sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
+    }
+
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // Some more simple shortcuts
+    if (N == 3)
+    {
+      return RAJA::max<element_type>(red1[0], m_value[2]);
+    }
+
+
+    // swap odd-even quads and combine
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    if (N == 4)
+    {
+      return red2[0];
+    }
+    if (N == 5)
+    {
+      return RAJA::max<element_type>(red2[0], m_value[4]);
+    }
+    if (N == 6)
+    {
+      return RAJA::max<element_type>(red2[0], red1[4]);
+    }
+
+    // 7 or 8 lanes
+    return RAJA::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // swap odd-even pairs and combine
+    auto sh1  = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // swap odd-even quads and combine
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    // combine quads
+    return RAJA::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the smallest element of first N lanes
+   * @return Smallest of lanes 0..N-1; limits<float>::max() if N is not 1..8
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<float>::max();
+    }
+    if (N == 1)
+    {
+      return m_value[0];
+    }
+    if (N == 2)
+    {
+      return RAJA::min<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and combine
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+
+    if (N == 7)
+    {
+      // lane 6 of the permute holds v[7]; blend v[6] back in to drop lane 8
+      sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
+    }
+
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // Some more simple shortcuts
+    if (N == 3)
+    {
+      return RAJA::min<element_type>(red1[0], m_value[2]);
+    }
+
+
+    // swap odd-even quads and combine
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    if (N == 4)
+    {
+      return red2[0];
+    }
+    if (N == 5)
+    {
+      return RAJA::min<element_type>(red2[0], m_value[4]);
+    }
+    if (N == 6)
+    {
+      return RAJA::min<element_type>(red2[0], red1[4]);
+    }
+
+    // 7 or 8 lanes
+    return RAJA::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX__
+#endif  //__AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp b/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
index 11ab97be16..abbce3482b 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
@@ -33,738 +33,764 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int32_t, avx_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx_register>>
+template <>
+class Register<int32_t, avx_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type       = Register<int32_t, avx_register>;
+  using element_type    = int32_t;
+  using register_type   = __m256i;
+
+  using int_vector_type = Register<int32_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Lane i is all-ones (-1) when i < N and zero otherwise
+    return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Lane i holds i * stride (set_epi32 takes lane 7 first, lane 0 last)
+    return _mm256_set_epi32(7 * stride, 6 * stride, 5 * stride, 4 * stride,
+                            3 * stride, 2 * stride, stride, 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
+  {
+    // Round 1 of min/max: lane i picks lane i^1, or lane 0 if i^1 >= N
+    return _mm256_set_epi32(N >= 7 ? 6 : 0, N >= 8 ? 7 : 0, N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0, N >= 3 ? 2 : 0, N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0, N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Round 2 of min/max: lane i picks lane i^2, or lane 0 if i^2 >= N
+    return _mm256_set_epi32(N >= 6 ? 5 : 0, N >= 5 ? 4 : 0, N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0, N >= 2 ? 1 : 0, N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0, N >= 3 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_epi32(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *        Reads 8 consecutive elements from ptr with an unaligned load.
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *        The register is zeroed first, so lanes that are not loaded stay 0.
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      set(ptr[i], i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *        Lane i is read from ptr[i * stride] for all 8 lanes.
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      set(ptr[i * stride], i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *        Lane i is read from ptr[i * stride]; lanes not loaded stay 0.
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      set(ptr[i * stride], i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *        Writes all 8 lanes starting at ptr with an unaligned store.
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N lanes to consecutive memory locations
+   *        Lanes at index N and above are masked out and are not written.
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(reinterpret_cast<float*>(ptr), createMask(N),
+                        _mm256_castsi256_ps(m_value));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *        Lane i is written to ptr[i * stride] for all 8 lanes.
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *        Lane i is written to ptr[i * stride] for each i below N.
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i, or 0 if i is outside 0..7
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // the extract intrinsic takes a constant lane index, hence the switch
+    switch (i)
+    {
+    case 0:
+      return _mm256_extract_epi32(m_value, 0);
+    case 1:
+      return _mm256_extract_epi32(m_value, 1);
+    case 2:
+      return _mm256_extract_epi32(m_value, 2);
+    case 3:
+      return _mm256_extract_epi32(m_value, 3);
+    case 4:
+      return _mm256_extract_epi32(m_value, 4);
+    case 5:
+      return _mm256_extract_epi32(m_value, 5);
+    case 6:
+      return _mm256_extract_epi32(m_value, 6);
+    case 7:
+      return _mm256_extract_epi32(m_value, 7);
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set; offsets outside 0..7 change nothing
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    // the insert intrinsic takes a constant lane index, hence the switch
+    switch (i)
+    {
+    case 0:
+      m_value = _mm256_insert_epi32(m_value, value, 0);
+      break;
+    case 1:
+      m_value = _mm256_insert_epi32(m_value, value, 1);
+      break;
+    case 2:
+      m_value = _mm256_insert_epi32(m_value, value, 2);
+      break;
+    case 3:
+      m_value = _mm256_insert_epi32(m_value, value, 3);
+      break;
+    case 4:
+      m_value = _mm256_insert_epi32(m_value, value, 4);
+      break;
+    case 5:
+      m_value = _mm256_insert_epi32(m_value, value, 5);
+      break;
+    case 6:
+      m_value = _mm256_insert_epi32(m_value, value, 6);
+      break;
+    case 7:
+      m_value = _mm256_insert_epi32(m_value, value, 7);
+      break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_epi32(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    // no 8-way 32-bit add, but there is a 4-way: add each 128-bit half
+
+    // Low 128-bits: the casts reinterpret the lower half of each register
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_add_epi32(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_add_epi32(hi_a, hi_b);
+
+    // Stitch back together: res_hi becomes the upper half of res_low
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    // no 8-way 32-bit subtract, but there is a 4-way... split and conquer
+
+    // Low 128-bits
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_sub_epi32(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_sub_epi32(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    // no 8-way 32-bit multiply, but there is a 32x32 -> 64
+    // Only the low 32 bits of each 64-bit product end up in the result
+
+    // Low 128-bits
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    // multiply even lanes 0, 2
+    auto res_low_even = _mm_mul_epi32(low_a, low_b);
+
+    // multiply odd lanes 1, 3 (0xB1 swaps them into the even positions)
+    auto low_a_sh    = _mm_shuffle_epi32(low_a, 0xB1);
+    auto low_b_sh    = _mm_shuffle_epi32(low_b, 0xB1);
+    auto res_low_odd = _mm_mul_epi32(low_a_sh, low_b_sh);
+
+    // shift odd products back to lanes 1, 3; 0x05 picks lanes 0, 2 from even
+    // note: AVX doesn't have a int32 blend, so we use the float32 blend
+    res_low_odd  = _mm_shuffle_epi32(res_low_odd, 0xB1);
+    auto res_low = _mm256_castsi128_si256(_mm_castps_si128(_mm_blend_ps(
+        _mm_castsi128_ps(res_low_odd), _mm_castsi128_ps(res_low_even), 0x05)));
+
+
+    // High 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    // multiply even lanes 0, 2
+    auto res_hi_even = _mm_mul_epi32(hi_a, hi_b);
+
+    // multiply odd lanes 1, 3 (0xB1 swaps them into the even positions)
+    auto hi_a_sh    = _mm_shuffle_epi32(hi_a, 0xB1);
+    auto hi_b_sh    = _mm_shuffle_epi32(hi_b, 0xB1);
+    auto res_hi_odd = _mm_mul_epi32(hi_a_sh, hi_b_sh);
+
+    // shift odd products back to lanes 1, 3; 0x05 picks lanes 0, 2 from even
+    // note: AVX doesn't have a int32 blend, so we use the float32 blend
+    res_hi_odd  = _mm_shuffle_epi32(res_hi_odd, 0xB1);
+    auto res_hi = _mm_castps_si128(_mm_blend_ps(
+        _mm_castsi128_ps(res_hi_odd), _mm_castsi128_ps(res_hi_even), 0x05));
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // no integer divide intrinsic is used: each lane is divided in scalar code
+    return self_type(_mm256_set_epi32(get(7) / b.get(7), get(6) / b.get(6),
+                                      get(5) / b.get(5), get(4) / b.get(4),
+                                      get(3) / b.get(3), get(2) / b.get(2),
+                                      get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // lanes below N are divided in scalar code; the remaining lanes are set to 0
+    return self_type(_mm256_set_epi32(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of all 8 scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // Low 128-bits: add pairs (0xB1), then the two pair sums (0x1B reverses)
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_add_epi32(low, low_sh1);
+
+    auto low_sh2  = _mm_shuffle_epi32(low_red1, 0x1B);
+    auto low_red2 = _mm_add_epi32(low_red1, low_sh2);
+
+
+    // High 128-bits: same two-step reduction
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_add_epi32(hi, hi_sh1);
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_add_epi32(hi_red1, hi_sh2);
+
+
+    // Sum halves; lane 0 then holds the total of all 8 lanes
+    auto hi_low = _mm_add_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<int32_t, avx_register>;
-      using element_type = int32_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int32_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_epi32(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi32(c)) {}
-
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256((__m256i const *)ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          set(ptr[i], i);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 8;++ i){
-          set(ptr[i*stride], i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          set(ptr[i*stride], i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(reinterpret_cast<float*>(ptr), createMask(N), reinterpret_cast<__m256>(m_value));
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi32(m_value, 0);
-          case 1: return _mm256_extract_epi32(m_value, 1);
-          case 2: return _mm256_extract_epi32(m_value, 2);
-          case 3: return _mm256_extract_epi32(m_value, 3);
-          case 4: return _mm256_extract_epi32(m_value, 4);
-          case 5: return _mm256_extract_epi32(m_value, 5);
-          case 6: return _mm256_extract_epi32(m_value, 6);
-          case 7: return _mm256_extract_epi32(m_value, 7);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi32(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi32(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi32(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi32(m_value, value, 3); break;
-          case 4: m_value = _mm256_insert_epi32(m_value, value, 4); break;
-          case 5: m_value = _mm256_insert_epi32(m_value, value, 5); break;
-          case 6: m_value = _mm256_insert_epi32(m_value, value, 6); break;
-          case 7: m_value = _mm256_insert_epi32(m_value, value, 7); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        // no 8-way 32-bit add, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_add_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_add_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        // no 8-way 32-bit subtract, but there is a 4-way... split and conquer
-
-        // Low 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_sub_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_sub_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // no 8-way 32-bit multiply, but there is a 32x32 -> 64
-        // This gets ugly :)
-
-        // Low 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        // multiply even lanes 0, 2
-        auto res_low_even = _mm_mul_epi32(low_a, low_b);
-
-        // multiply odd lanes 1, 3
-        auto low_a_sh = _mm_shuffle_epi32(low_a, 0xB1);
-        auto low_b_sh = _mm_shuffle_epi32(low_b, 0xB1);
-        auto res_low_odd = _mm_mul_epi32(low_a_sh, low_b_sh);
-
-        // recombine to get all 4 lanes
-        // note: AVX doesn't have a int32 blend, so we use the float32 blend
-        res_low_odd = _mm_shuffle_epi32(res_low_odd, 0xB1);
-        auto res_low = _mm256_castsi128_si256(_mm_castps_si128(
-            _mm_blend_ps(_mm_castsi128_ps(res_low_odd),
-                         _mm_castsi128_ps(res_low_even),
-                         0x05)
-            ));
-
-
-        // High 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        // multiply even lanes 0, 2
-        auto res_hi_even = _mm_mul_epi32(hi_a, hi_b);
-
-        // multiply odd lanes 1, 3
-        auto hi_a_sh = _mm_shuffle_epi32(hi_a, 0xB1);
-        auto hi_b_sh = _mm_shuffle_epi32(hi_b, 0xB1);
-        auto res_hi_odd = _mm_mul_epi32(hi_a_sh, hi_b_sh);
-
-        // recombine to get all 4 lanes
-        // note: AVX doesn't have a int32 blend, so we use the float32 blend
-        res_hi_odd = _mm_shuffle_epi32(res_hi_odd, 0xB1);
-        auto res_hi = _mm_castps_si128(
-            _mm_blend_ps(_mm_castsi128_ps(res_hi_odd),
-                         _mm_castsi128_ps(res_hi_even),
-                         0x05)
-            );
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_add_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-        auto low_red2 = _mm_add_epi32(low_red1, low_sh2);
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_add_epi32(hi, hi_sh1);
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_add_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_add_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_max_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
-
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract final reduction
-        auto hi_low = _mm_max_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::min();
-        }
-
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-        if(N==1){
-          return _mm256_extract_epi32(m_value, 0);
-        }
+    // this is just painful, since we don't have a proper masked permute
+    // in AVX.  Lots of special cases to make sure we compare just the
+    // right lanes
+
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_max_epi32(low, low_sh1);
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
 
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
 
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_max_epi32(low, low_sh1);
 
-        if(N==2){
-          return _mm_extract_epi32(low_red1, 0);
-        }
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
 
-        if(N==3){
-          // get lane 2 into lane 0
-          auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
-          auto low_red1a = _mm_max_epi32(low_red1, low_sh1a);
-          return _mm_extract_epi32(low_red1a, 0);
-        }
 
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
 
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
 
-        if(N==4){
-          return _mm_extract_epi32(low_red2, 0);
-        }
 
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
+    // Sum halves, extract final reduction
+    auto hi_low = _mm_max_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Max of lanes 0..N-1, or limits<int32_t>::min() if N is outside 1..8
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // N out of range: return the lowest int32_t value
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<int32_t>::min();
+    }
+
+    // this is just painful, since we don't have a proper masked permute
+    // in AVX.  Lots of special cases to make sure we compare just the
+    // right lanes
+    if (N == 1)
+    {
+      return _mm256_extract_epi32(m_value, 0);
+    }
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_max_epi32(low, low_sh1);
+
+    if (N == 2)
+    {
+      return _mm_extract_epi32(low_red1, 0);
+    }
+
+    if (N == 3)
+    {
+      // get lane 2 into lane 0
+      auto low_sh1a  = _mm_shuffle_epi32(low, 0x2);
+      auto low_red1a = _mm_max_epi32(low_red1, low_sh1a);
+      return _mm_extract_epi32(low_red1a, 0);
+    }
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
+
+    if (N == 4)
+    {
+      return _mm_extract_epi32(low_red2, 0);
+    }
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    if (N == 5)
+    {
+      auto red_5 = _mm_max_epi32(low_red2, hi);
+      return _mm_extract_epi32(red_5, 0);
+    }
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
+
+    if (N == 6)
+    {
+      auto red_6 = _mm_max_epi32(low_red2, hi_red1);
+      return _mm_extract_epi32(red_6, 0);
+    }
+    if (N == 7)
+    {
+      // get lane 6 (lane 2 of hi) into lane 0
+      auto hi_sh7   = _mm_shuffle_epi32(hi, 0x2);
+      auto hi_red_6 = _mm_max_epi32(hi_sh7, hi_red1);
+      auto red_7    = _mm_max_epi32(low_red2, hi_red_6);
+      return _mm_extract_epi32(red_7, 0);
+    }
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
+
+
+    // Take the max of both halves; lane 0 holds the final reduction
+    auto hi_low = _mm_max_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values of this register and b
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type b) const
+  {
+    // no 8-way 32-bit max, but there is a 4-way... split and conquer
+
+    // Low 128-bits (cast result back to 256-bit so the hi half can be inserted)
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_max_epi32(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_max_epi32(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // Full 8-lane reduction: each 128-bit half is reduced with two
+    // shuffle/min steps, then the two halves are combined.  Every lane
+    // participates, so no per-N special cases are needed here
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_min_epi32(low, low_sh1);
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
+
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
+
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+
+
+    // Min of both halves, extract final reduction from lane 0
+    auto hi_low = _mm_min_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements (lanes 0..N-1)
+   * @return Smallest of lanes 0..N-1; limits<int32_t>::max() if N <= 0 or N > 8
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<int32_t>::max();
+    }
+    // this is just painful, since we don't have a proper masked permute
+    // in AVX.  Lots of special cases to make sure we compare just the
+    // right lanes
+    if (N == 1)
+    {
+      return _mm256_extract_epi32(m_value, 0);
+    }
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_min_epi32(low, low_sh1);
+
+    if (N == 2)
+    {
+      return _mm_extract_epi32(low_red1, 0);
+    }
+
+    if (N == 3)
+    {
+      // get lane 2 into lane 0
+      auto low_sh1a  = _mm_shuffle_epi32(low, 0x2);
+      auto low_red1a = _mm_min_epi32(low_red1, low_sh1a);
+      return _mm_extract_epi32(low_red1a, 0);
+    }
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
+
+    if (N == 4)
+    {
+      return _mm_extract_epi32(low_red2, 0);
+    }
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    if (N == 5)
+    {
+      auto red_5 = _mm_min_epi32(low_red2, hi);
+      return _mm_extract_epi32(red_5, 0);
+    }
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
+
+    if (N == 6)
+    {
+      auto red_6 = _mm_min_epi32(low_red2, hi_red1);
+      return _mm_extract_epi32(red_6, 0);
+    }
+    if (N == 7)
+    {
+      // get lane 6 (lane 2 of hi) into lane 0
+      auto hi_sh7   = _mm_shuffle_epi32(hi, 0x2);
+      auto hi_red_6 = _mm_min_epi32(hi_sh7, hi_red1);
+      auto red_7    = _mm_min_epi32(low_red2, hi_red_6);
+      return _mm_extract_epi32(red_7, 0);
+    }
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+
+
+    // Min of both halves, extract final reduction from lane 0
+    auto hi_low = _mm_min_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type b) const
+  {
+    // no 8-way 32-bit min, but there is a 4-way... split and conquer
+
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_min_epi32(low_a, low_b));
 
-        if(N==5){
-          auto red_5 = _mm_max_epi32(low_red2, hi);
-          return _mm_extract_epi32(red_5, 0);
-        }
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
-
-        if(N==6){
-          auto red_6 = _mm_max_epi32(low_red2, hi_red1);
-          return _mm_extract_epi32(red_6, 0);
-        }
-        if(N==7){
-          // get lane 6 (lane 2 of hi) into lane 0
-          auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
-          auto hi_red_6 = _mm_max_epi32(hi_sh7, hi_red1);
-          auto red_7 = _mm_max_epi32(low_red2, hi_red_6);
-          return _mm_extract_epi32(red_7, 0);
-        }
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_min_epi32(hi_a, hi_b);
 
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_max_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type b) const
-      {
-        // no 8-way 32-bit min, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_max_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_max_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_min_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
-
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_min_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::max();
-        }
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-        if(N==1){
-          return _mm256_extract_epi32(m_value, 0);
-        }
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_min_epi32(low, low_sh1);
-
-        if(N==2){
-          return _mm_extract_epi32(low_red1, 0);
-        }
-
-        if(N==3){
-          // get lane 2 into lane 0
-          auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
-          auto low_red1a = _mm_min_epi32(low_red1, low_sh1a);
-          return _mm_extract_epi32(low_red1a, 0);
-        }
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
-
-        if(N==4){
-          return _mm_extract_epi32(low_red2, 0);
-        }
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        if(N==5){
-          auto red_5 = _mm_min_epi32(low_red2, hi);
-          return _mm_extract_epi32(red_5, 0);
-        }
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
-
-        if(N==6){
-          auto red_6 = _mm_min_epi32(low_red2, hi_red1);
-          return _mm_extract_epi32(red_6, 0);
-        }
-        if(N==7){
-          // get lane 6 (lane 2 of hi) into lane 0
-          auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
-          auto hi_red_6 = _mm_min_epi32(hi_sh7, hi_red1);
-          auto red_7 = _mm_min_epi32(low_red2, hi_red_6);
-          return _mm_extract_epi32(red_7, 0);
-        }
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+};
 
 
-        // Sum halves, extract total sum
-        auto hi_low = _mm_min_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type b) const
-      {
-        // no 8-way 32-bit min, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_min_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_min_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-  };
-
-
-}   // namespace expt
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp b/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
index 1c7fae3dc7..e0a03bec4f 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
@@ -33,506 +33,525 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx_register>>
+template <>
+class Register<int64_t, avx_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type       = Register<int64_t, avx_register>;
+  using element_type    = int64_t;
+  using register_type   = __m256i;
+
+  using int_vector_type = Register<int64_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+  /*
+   * Use the packed-double permute function because there isn't one
+   * specifically for int64
+   *
+   * Just adds a bunch of casting, should be same cost
+   */
+  template <int perm>
+  RAJA_INLINE __m256i permute(__m256i x) const
+  {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(x), perm));
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_epi64x(x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_epi64x(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_castpd_si256(_mm256_maskload_pd(
+        reinterpret_cast<double const*>(ptr), createMask(N)));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *        Elements N and above are set to zero.
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *        (masked store: elements N and above are not written)
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_pd(reinterpret_cast<double*>(ptr), createMask(N),
+                        _mm256_castsi256_pd(m_value));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *        (element i is written to ptr[i * stride])
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store the first N elements to strided memory locations
+   *        (element i is written to ptr[i * stride] for i < N)
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+    case 0:
+      return _mm256_extract_epi64(m_value, 0);
+    case 1:
+      return _mm256_extract_epi64(m_value, 1);
+    case 2:
+      return _mm256_extract_epi64(m_value, 2);
+    case 3:
+      return _mm256_extract_epi64(m_value, 3);
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+    case 0:
+      m_value = _mm256_insert_epi64(m_value, value, 0);
+      break;
+    case 1:
+      m_value = _mm256_insert_epi64(m_value, value, 1);
+      break;
+    case 2:
+      m_value = _mm256_insert_epi64(m_value, value, 2);
+      break;
+    case 3:
+      m_value = _mm256_insert_epi64(m_value, value, 3);
+      break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_epi64x(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    // no 4-way 64-bit add, but there is a 2-way SSE... split and conquer
+
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_add_epi64(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_add_epi64(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    // no 4-way 64-bit subtract, but there is a 2-way SSE... split and conquer
+
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_sub_epi64(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_sub_epi64(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    // AVX2 does not supply an int64_t multiply, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) * b.get(3), get(2) * b.get(2),
+                                       get(1) * b.get(1), get(0) * b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) / b.get(3), get(2) / b.get(2),
+                                       get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap the two elements within each 128-bit half
+    auto sh1 = permute<0x5>(m_value);
+
+    // Add lower 128-bits
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(sh1);
+    auto res_low = _mm_add_epi64(low_a, low_b);
+
+    // Add upper 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(sh1, 1);
+    auto res_hi = _mm_add_epi64(hi_a, hi_b);
+
+    // Sum upper and lower
+    auto res = _mm_add_epi64(res_hi, res_low);
+
+    // element 0 now holds the sum of all four elements
+    return _mm_extract_epi64(res, 0);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // AVX2 does not supply an 64bit integer max!
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red     = red < v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red     = red < v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red     = red < v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<int64_t, avx_register>;
-      using element_type = int64_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int64_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-      /*
-       * Use the packed-double permute function because there isn't one
-       * specifically for int64
-       *
-       * Just adds a bunch of casting, should be same cost
-       */
-      template<int perm>
-      RAJA_INLINE
-      __m256i permute(__m256i x) const {
-        return _mm256_castpd_si256(
-            _mm256_permute_pd(_mm256_castsi256_pd(x), perm));
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(),  m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_epi64x(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi64x(c)) {}
-
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_castpd_si256(
-            _mm256_maskload_pd(reinterpret_cast<double const *>(ptr), createMask(N))
-        );
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 4;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_pd(reinterpret_cast<double*>(ptr), createMask(N), reinterpret_cast<__m256d>(m_value));
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi64(m_value, 0);
-          case 1: return _mm256_extract_epi64(m_value, 1);
-          case 2: return _mm256_extract_epi64(m_value, 2);
-          case 3: return _mm256_extract_epi64(m_value, 3);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi64(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi64(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi64(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi64(m_value, value, 3); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi64x(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        // no 4-way 64-bit add, but there is a 2-way SSE... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_add_epi64(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_add_epi64(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        // no 4-way 64-bit subtract, but there is a 2-way SSE... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_sub_epi64(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_sub_epi64(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // AVX2 does not supply an int64_t multiply, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)*b.get(3),
-            get(2)*b.get(2),
-            get(1)*b.get(1),
-            get(0)*b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap pairs and add
-        auto sh1 = permute<0x5>(m_value);
-
-        // Add lower 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(sh1);
-        auto res_low = _mm_add_epi64(low_a, low_b);
-
-        // Add upper 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(sh1, 1);
-        auto res_hi = _mm_add_epi64(hi_a, hi_b);
-
-        // Sum upper and lower
-        auto res = _mm_add_epi64(res_hi, res_low);
-
-        // add lower and upper
-        return _mm_extract_epi64(res, 0);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // AVX2 does not supply an 64bit integer max!
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red < v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red < v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red < v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::min();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red < v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red < v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red < v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) > a.get(3) ? get(3) : a.get(3),
-              get(2) > a.get(2) ? get(2) : a.get(2),
-              get(1) > a.get(1) ? get(1) : a.get(1),
-              get(0) > a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red > v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red > v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red > v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::max();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red > v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red > v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red > v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) < a.get(3) ? get(3) : a.get(3),
-              get(2) < a.get(2) ? get(2) : a.get(2),
-              get(1) < a.get(1) ? get(1) : a.get(1),
-              get(0) < a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-  };
-
-
-}   // namespace expt
+    if (N <= 0 || N > 4)
+    {
+      return RAJA::operators::limits<int64_t>::min();
+    }
+
+    // AVX2 does not supply an 64bit integer max?!?
+    auto red = get(0);
+
+    if (N > 1)
+    {
+      auto v1 = get(1);
+      red     = red < v1 ? v1 : red;
+    }
+    if (N > 2)
+    {
+      auto v2 = get(2);
+      red     = red < v2 ? v2 : red;
+    }
+    if (N > 3)
+    {
+      auto v3 = get(3);
+      red     = red < v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Element-wise maximum of this register and a, one lane at a time
+   * @return Register whose lane i is the larger of get(i) and a.get(i)
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) > a.get(3) ? get(3) : a.get(3),
+                                       get(2) > a.get(2) ? get(2) : a.get(2),
+                                       get(1) > a.get(1) ? get(1) : a.get(1),
+                                       get(0) > a.get(0) ? get(0) : a.get(0)));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // AVX2 does not supply a 64-bit integer min, so reduce lane by lane
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red     = red > v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red     = red > v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red     = red > v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Smallest of lanes 0..N-1; limits<int64_t>::max() unless 1<=N<=4
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4)
+    {
+      return RAJA::operators::limits<int64_t>::max();
+    }
+
+    // AVX2 does not supply a 64-bit integer min, so reduce lane by lane
+    auto red = get(0);
+
+    if (N > 1)
+    {
+      auto v1 = get(1);
+      red     = red > v1 ? v1 : red;
+    }
+    if (N > 2)
+    {
+      auto v2 = get(2);
+      red     = red > v2 ? v2 : red;
+    }
+    if (N > 3)
+    {
+      auto v3 = get(3);
+      red     = red > v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Element-wise minimum of this register and a, one lane at a time
+   * @return Register whose lane i is the smaller of get(i) and a.get(i)
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) < a.get(3) ? get(3) : a.get(3),
+                                       get(2) < a.get(2) ? get(2) : a.get(2),
+                                       get(1) < a.get(1) ? get(1) : a.get(1),
+                                       get(0) < a.get(0) ? get(0) : a.get(0)));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx/traits.hpp b/include/RAJA/policy/tensor/arch/avx/traits.hpp
index 33c18e2c5f..ad0c7b3d26 100644
--- a/include/RAJA/policy/tensor/arch/avx/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/traits.hpp
@@ -20,52 +20,59 @@
 #ifndef RAJA_policy_tensor_arch_avx_traits_HPP
 #define RAJA_policy_tensor_arch_avx_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
+template <>  // AVX register traits for int32_t elements
+struct RegisterTraits<RAJA::expt::avx_register, int32_t>
+{
+  using element_type                      = int32_t;
+  using register_policy                   = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;  // register width in bits
+  static constexpr camp::idx_t s_num_elem = 8;    // 256 bits / 32-bit element
+  using int_element_type                  = int32_t;  // same-width integer
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
+template <>  // AVX register traits for int64_t elements
+struct RegisterTraits<RAJA::expt::avx_register, int64_t>
+{
+  using element_type                      = int64_t;
+  using register_policy                   = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;  // register width in bits
+  static constexpr camp::idx_t s_num_elem = 4;    // 256 bits / 64-bit element
+  using int_element_type                  = int64_t;  // same-width integer
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
+template <>  // AVX register traits for float elements
+struct RegisterTraits<RAJA::expt::avx_register, float>
+{
+  using element_type                      = float;
+  using register_policy                   = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;  // register width in bits
+  static constexpr camp::idx_t s_num_elem = 8;    // 256 bits / 32-bit element
+  using int_element_type                  = int32_t;  // integer as wide as float
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
+template <>  // AVX register traits for double elements
+struct RegisterTraits<RAJA::expt::avx_register, double>
+{
+  using element_type                      = double;
+  using register_policy                   = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;  // register width in bits
+  static constexpr camp::idx_t s_num_elem = 4;    // 256 bits / 64-bit element
+  using int_element_type                  = int64_t;  // integer as wide as double
+};
 
-} // namespace intenral
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // __AVX__
+#endif  // __AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx2.hpp b/include/RAJA/policy/tensor/arch/avx2.hpp
index b462257924..4ae2ca6bdd 100644
--- a/include/RAJA/policy/tensor/arch/avx2.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2.hpp
@@ -17,11 +17,11 @@
 
 #ifdef __AVX2__
 
-#include<RAJA/policy/tensor/arch/avx2/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_float.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_double.hpp>
+#include <RAJA/policy/tensor/arch/avx2/traits.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_float.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_double.hpp>
 
 
-#endif // __AVX2__
+#endif  // __AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
index 852003a4f9..eba85017e0 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
@@ -34,529 +34,546 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<double, avx2_register> :
-    public internal::expt::RegisterBase<Register<double, avx2_register>>
+template <>
+class Register<double, avx2_register>
+    : public internal::expt::RegisterBase<Register<double, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type       = Register<double, avx2_register>;
+  using element_type    = double;
+  using register_type   = __m256d;
+
+  using int_vector_type = Register<int64_t, avx2_register>;
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Lane mask: all bits set (-1) in lanes 0..N-1, zero in the others
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Element offsets {0, stride, 2*stride, 3*stride} for lanes 0..3
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+  /*!
+   * @brief Default constructor, sets all 4 lanes of the register to zero
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_pd()) {}
+
+  /*!
+   * @brief Construct register with explicit values; x0 is lane 0, x3 lane 3
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_pd(x3, x2, x1, x0))
+  {}
+
+
+  /*!
+   * @brief Construct from an underlying SIMD register value (explicit only)
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor; copies the base and the SIMD value of c
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator; copies the SIMD value of c
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar (not explicit, so scalars convert).
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_pd(c)) {}
+
+
+  /*!
+   * @brief Returns the underlying SIMD register (register_type) by value.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<double, avx2_register>;
-      using element_type = double;
-      using register_type = __m256d;
-
-      using int_vector_type = Register<int64_t, avx2_register>;
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_pd(x3,x2,x1,x0))
-      {}
-
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_packed ++;
+    RAJA::tensor_stats::num_vector_load_packed++;
 #endif
-        m_value = _mm256_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
+    m_value = _mm256_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_packed_n ++;
+    RAJA::tensor_stats::num_vector_load_packed_n++;
 #endif
-        m_value = _mm256_maskload_pd(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
+    m_value = _mm256_maskload_pd(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided ++;
+    RAJA::tensor_stats::num_vector_load_strided++;
 #endif
-        m_value = _mm256_i64gather_pd(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
+    m_value = _mm256_i64gather_pd(ptr, createStridedOffsets(stride),
+                                  sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_pd(_mm256_setzero_pd(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      _mm256_castsi256_pd(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
+    m_value = _mm256_mask_i64gather_pd(
+        _mm256_setzero_pd(), ptr, createStridedOffsets(stride),
+        _mm256_castsi256_pd(createMask(N)), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_i64gather_pd(ptr,
-                                      offsets.get_register(),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
+    m_value =
+        _mm256_i64gather_pd(ptr, offsets.get_register(), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  gather_n(element_type const* ptr, int_vector_type offsets, camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_pd(_mm256_setzero_pd(),
-                                      ptr,
-                                      offsets.get_register(),
-                                      _mm256_castsi256_pd(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
+    m_value = _mm256_mask_i64gather_pd(
+        _mm256_setzero_pd(), ptr, offsets.get_register(),
+        _mm256_castsi256_pd(createMask(N)), sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_packed ++;
+    RAJA::tensor_stats::num_vector_store_packed++;
 #endif
-        _mm256_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
+    _mm256_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_packed_n ++;
+    RAJA::tensor_stats::num_vector_store_packed_n++;
 #endif
-        _mm256_maskstore_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
+    _mm256_maskstore_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_strided ++;
+    RAJA::tensor_stats::num_vector_store_strided++;
 #endif
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store first N elements to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_strided_n ++;
+    RAJA::tensor_stats::num_vector_store_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_pd(value);
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        switch(i){
-          case 0: return self_type(_mm256_permute4x64_pd (m_value, 0x00));
-          case 1: return self_type(_mm256_permute4x64_pd (m_value, 0x55));
-          case 2: return self_type(_mm256_permute4x64_pd (m_value, 0xAA));
-          case 3: return self_type(_mm256_permute4x64_pd (m_value, 0xFF));
-        }
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply a masked divide, so do it manually
-        return self_type(_mm256_set_pd(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Lane index of the scalar to get (not range-checked here)
+   * @return Scalar value stored in lane i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Lane index of the scalar to set (not range-checked here)
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)  // set every lane to value
+  {
+    m_value = _mm256_set1_pd(value);
+    return *this;
+  }
+
+  /*!
+   * @brief Broadcasts lane i (0..3) of this register to a new register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    switch (i)  // imm8 packs four 2-bit lane selectors, all equal to i
+    {
+    case 0:
+      return self_type(_mm256_permute4x64_pd(m_value, 0x00));
+    case 1:
+      return self_type(_mm256_permute4x64_pd(m_value, 0x55));
+    case 2:
+      return self_type(_mm256_permute4x64_pd(m_value, 0xAA));
+    case 3:
+      return self_type(_mm256_permute4x64_pd(m_value, 0xFF));
+    }
+    return *this;  // i outside 0..3: hand back an unchanged copy
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)  // overwrite this register with src
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const  // element-wise this + b
+  {
+    return self_type(_mm256_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const  // element-wise this - b
+  {
+    return self_type(_mm256_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const  // element-wise this * b
+  {
+    return self_type(_mm256_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const  // element-wise this / b
+  {
+    return self_type(_mm256_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // No masked divide in AVX2: divide lanes 0..N-1 by hand, zero the rest
+    return self_type(_mm256_set_pd(
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmadd_pd(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmsub_pd(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE  // fused multiply-add: (this * b) + c in every lane
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm256_fmadd_pd(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE  // fused multiply-subtract: (this * b) - c in every lane
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm256_fmsub_pd(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum(camp::idx_t = 4) const
-      {
-        auto sh1 = _mm256_permute_pd(m_value, 0x5);
-        auto red1 = _mm256_add_pd(m_value, sh1);
-        return red1[0]+red1[2];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max(camp::idx_t N = 4) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[3]},
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper halves
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[2]},   <-- just v[2]
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper lane
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::min();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_pd(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the minimum value of each lane
-        // B = { min{v[0], v[1]},
-        //       min{v[0], v[1]},
-        //       min{v[2], v[3]},
-        //       min{v[2], v[3]} }
-        register_type b = _mm256_min_pd(m_value, a);
-
-        // now take the minimum of a lower and upper halves
-        return RAJA::min<element_type>(b[0], b[2]);
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[3]},
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper halves
-          return std::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[2]},   <-- just v[2]
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper lane
-          return std::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return std::min<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::max();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum(camp::idx_t = 4) const
+  {
+    auto sh1  = _mm256_permute_pd(m_value, 0x5);
+    auto red1 = _mm256_add_pd(m_value, sh1);
+    return red1[0] + red1[2];
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max(camp::idx_t N = 4) const
+  {
+    if (N == 4)
+    {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[3]},
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper halves
+      return RAJA::max<element_type>(b[0], b[2]);
+    }
+    else if (N == 3)
+    {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[1], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[1], v[1]},   <-- just v[1]
+      //       max{v[2], v[2]},   <-- just v[2]
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper lane
+      return RAJA::max<element_type>(b[0], b[2]);
+    }
+    else if (N == 2)
+    {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    }
+    else if (N == 1)
+    {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::min();
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_pd(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the minimum value of each lane
+    // B = { min{v[0], v[1]},
+    //       min{v[0], v[1]},
+    //       min{v[2], v[3]},
+    //       min{v[2], v[3]} }
+    register_type b = _mm256_min_pd(m_value, a);
+
+    // now take the minimum of a lower and upper halves
+    return RAJA::min<element_type>(b[0], b[2]);
+  }
+
+  /*!
+   * @brief Returns the smallest element from the first N lanes
+   * @return The smallest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N == 4)
+    {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[3]},
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper halves
+      return std::min<element_type>(b[0], b[2]);
+    }
+    else if (N == 3)
+    {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[1], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[1], v[1]},   <-- just v[1]
+      //       min{v[2], v[2]},   <-- just v[2]
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper lane
+      return std::min<element_type>(b[0], b[2]);
+    }
+    else if (N == 2)
+    {
+      return std::min<element_type>(m_value[0], m_value[1]);
+    }
+    else if (N == 1)
+    {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::max();
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
index 4b1e11419d..77d814e293 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
@@ -33,487 +33,486 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<float, avx2_register> :
-    public internal::expt::RegisterBase<Register<float, avx2_register>>
+template <>
+class Register<float, avx2_register>
+    : public internal::expt::RegisterBase<Register<float, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<float, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type       = Register<float, avx2_register>;
+  using element_type    = float;
+  using register_type   = __m256;
+
+  using int_vector_type = Register<int32_t, avx2_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi32(7 * stride, 6 * stride, 5 * stride, 4 * stride,
+                            3 * stride, 2 * stride, stride, 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
+  {
+    // Generate a permutation for first round of min/max routines
+    return _mm256_set_epi32(N >= 7 ? 6 : 0, N >= 8 ? 7 : 0, N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0, N >= 3 ? 2 : 0, N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0, N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Generate a permutation for second round of min/max routines
+    return _mm256_set_epi32(N >= 6 ? 5 : 0, N >= 5 ? 4 : 0, N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0, N >= 2 ? 1 : 0, N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0, N >= 2 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_ps()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_ps(c)) {}
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_ps(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i32gather_ps(ptr, createStridedOffsets(stride),
+                                  sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<float, avx2_register>;
-      using element_type = float;
-      using register_type = __m256;
-
-      using int_vector_type = Register<int32_t, avx2_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_ps(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_ps(c)) {}
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_ps(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        m_value = _mm256_i32gather_ps(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i32gather_ps(_mm256_setzero_ps(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      _mm256_castsi256_ps(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX2 does not supply a masked divide
-        return self_type(_mm256_set_ps(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
+    m_value = _mm256_mask_i32gather_ps(
+        _mm256_setzero_ps(), ptr, createStridedOffsets(stride),
+        _mm256_castsi256_ps(createMask(N)), sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N register elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store first N register elements to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_ps(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm256_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm256_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide
+    return self_type(_mm256_set_ps(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmadd_ps(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmsub_ps(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm256_fmadd_ps(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm256_fmsub_ps(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_add_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_add_ps(red1, sh2);
-
-        return red2[0] + red2[4];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(8));
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        return std::max<element_type>(red2[0], red2[4]);
-
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::min();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return std::max<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        if(N == 3){
-          return std::max<element_type>(red1[0], m_value[2]);
-        }
-        if(N == 4){
-          return std::max<element_type>(red1[0], red1[2]);
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(N));
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        return std::max<element_type>(red2[0], red2[4]);
-
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(8));
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        return std::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::max();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return std::min<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        if(N == 3){
-          return std::min<element_type>(red1[0], m_value[2]);
-        }
-        if(N == 4){
-          return std::min<element_type>(red1[0], red1[2]);
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(N));
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        return std::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap odd-even pairs and add
+    auto sh1  = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_add_ps(m_value, sh1);
+
+    // swap odd-even quads and add
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_add_ps(red1, sh2);
+
+    return red2[0] + red2[4];
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+
+    // swap odd-even pairs and take the max
+    auto sh1  = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // swap odd-even quads and take the max
+    auto sh2  = _mm256_permutevar8x32_ps(red1, createPermute2(8));
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    return std::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the largest element from the first N lanes
+   * @return The largest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<float>::min();
+    }
+    if (N == 1)
+    {
+      return m_value[0];
+    }
+    if (N == 2)
+    {
+      return std::max<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and take the max
+    auto sh1  = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    if (N == 3)
+    {
+      return std::max<element_type>(red1[0], m_value[2]);
+    }
+    if (N == 4)
+    {
+      return std::max<element_type>(red1[0], red1[2]);
+    }
+
+    // swap odd-even quads and take the max
+    auto sh2  = _mm256_permutevar8x32_ps(red1, createPermute2(N));
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    return std::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // swap odd-even pairs and take the min
+    auto sh1  = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // swap odd-even quads and take the min
+    auto sh2  = _mm256_permutevar8x32_ps(red1, createPermute2(8));
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    return std::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the smallest element from the first N lanes
+   * @return The smallest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<float>::max();
+    }
+    if (N == 1)
+    {
+      return m_value[0];
+    }
+    if (N == 2)
+    {
+      return std::min<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and take the min
+    auto sh1  = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    if (N == 3)
+    {
+      return std::min<element_type>(red1[0], m_value[2]);
+    }
+    if (N == 4)
+    {
+      return std::min<element_type>(red1[0], red1[2]);
+    }
+
+    // swap odd-even quads and take the min
+    auto sh2  = _mm256_permutevar8x32_ps(red1, createPermute2(N));
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    return std::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
index ab5948a3f2..fbc671b127 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
@@ -34,535 +34,562 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<int32_t, avx2_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx2_register>>
+template <>
+class Register<int32_t, avx2_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type       = Register<int32_t, avx2_register>;
+  using element_type    = int32_t;
+  using register_type   = __m256i;
+
+  using int_vector_type = Register<int32_t, avx2_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask: lanes 0..N-1 are all ones (-1), remaining lanes are 0
+    return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi32(7 * stride, 6 * stride, 5 * stride, 4 * stride,
+                            3 * stride, 2 * stride, stride, 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
+  {
+    // Generate a permutation for first round of min/max routines
+    return _mm256_set_epi32(N >= 7 ? 6 : 0, N >= 8 ? 7 : 0, N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0, N >= 3 ? 2 : 0, N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0, N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Permutation for second min/max round. NOTE(review): lane 0 guard N>=3?
+    return _mm256_set_epi32(N >= 6 ? 5 : 0, N >= 5 ? 4 : 0, N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0, N >= 2 ? 1 : 0, N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0, N >= 2 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_epi32(c)) {}
+
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_si256((__m256i const*)ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_epi32(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i32gather_epi32(ptr, createStridedOffsets(stride),
+                                     sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   * Lanes N and above are taken from the zeroed source operand.
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    m_value = _mm256_mask_i32gather_epi32(_mm256_setzero_si256(), ptr,
+                                          createStridedOffsets(stride),
+                                          createMask(N), sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N register elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_epi32(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   * Element i of the register is written to ptr[i * stride].
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   * Elements 0..N-1 of the register are written to ptr[i * stride].
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+    case 0:
+      return _mm256_extract_epi32(m_value, 0);
+    case 1:
+      return _mm256_extract_epi32(m_value, 1);
+    case 2:
+      return _mm256_extract_epi32(m_value, 2);
+    case 3:
+      return _mm256_extract_epi32(m_value, 3);
+    case 4:
+      return _mm256_extract_epi32(m_value, 4);
+    case 5:
+      return _mm256_extract_epi32(m_value, 5);
+    case 6:
+      return _mm256_extract_epi32(m_value, 6);
+    case 7:
+      return _mm256_extract_epi32(m_value, 7);
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+    case 0:
+      m_value = _mm256_insert_epi32(m_value, value, 0);
+      break;
+    case 1:
+      m_value = _mm256_insert_epi32(m_value, value, 1);
+      break;
+    case 2:
+      m_value = _mm256_insert_epi32(m_value, value, 2);
+      break;
+    case 3:
+      m_value = _mm256_insert_epi32(m_value, value, 3);
+      break;
+    case 4:
+      m_value = _mm256_insert_epi32(m_value, value, 4);
+      break;
+    case 5:
+      m_value = _mm256_insert_epi32(m_value, value, 5);
+      break;
+    case 6:
+      m_value = _mm256_insert_epi32(m_value, value, 6);
+      break;
+    case 7:
+      m_value = _mm256_insert_epi32(m_value, value, 7);
+      break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_epi32(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+
+    // the AVX2 epi32 multiply only multiplies the even elements
+    // and provides 64-bit results
+    // need to do some repacking to get this to work
+
+    // multiply 0, 2, 4, 6
+    auto prod_even = _mm256_mul_epi32(m_value, b.m_value);
+
+    // Swap 32-bit words
+    auto sh_a = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
+
+    auto sh_b = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(b.m_value), 0xB1));
+
+    // multiply 1, 3, 5, 7
+    auto prod_odd = _mm256_mul_epi32(sh_a, sh_b);
+
+    // Stitch prod_odd and prod_even back together
+    auto sh_odd = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(prod_odd), 0xB1));
+
+    return self_type(_mm256_blend_epi32(prod_even, sh_odd, 0xAA));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi32(get(7) / b.get(7), get(6) / b.get(6),
+                                      get(5) / b.get(5), get(4) / b.get(4),
+                                      get(3) / b.get(3), get(2) / b.get(2),
+                                      get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi32(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap odd-even pairs and add
+    auto sh1 = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
+    auto red1 = _mm256_add_epi32(m_value, sh1);
+
+
+    // swap odd-even quads and add
+    auto sh2 =
+        _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(red1), 0x4E));
+    auto red2 = _mm256_add_epi32(red1, sh2);
+
+    return _mm256_extract_epi32(red2, 0) + _mm256_extract_epi32(red2, 4);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<int32_t, avx2_register>;
-      using element_type = int32_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int32_t, avx2_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_epi32(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi32(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256((__m256i const *)ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_epi32(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        m_value = _mm256_i32gather_epi32(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i32gather_epi32(_mm256_setzero_si256(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_epi32(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi32(m_value, 0);
-          case 1: return _mm256_extract_epi32(m_value, 1);
-          case 2: return _mm256_extract_epi32(m_value, 2);
-          case 3: return _mm256_extract_epi32(m_value, 3);
-          case 4: return _mm256_extract_epi32(m_value, 4);
-          case 5: return _mm256_extract_epi32(m_value, 5);
-          case 6: return _mm256_extract_epi32(m_value, 6);
-          case 7: return _mm256_extract_epi32(m_value, 7);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi32(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi32(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi32(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi32(m_value, value, 3); break;
-          case 4: m_value = _mm256_insert_epi32(m_value, value, 4); break;
-          case 5: m_value = _mm256_insert_epi32(m_value, value, 5); break;
-          case 6: m_value = _mm256_insert_epi32(m_value, value, 6); break;
-          case 7: m_value = _mm256_insert_epi32(m_value, value, 7); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-
-        // the AVX2 epi32 multiply only multiplies the even elements
-        // and provides 64-bit results
-        // need to do some repacking to get this to work
-
-        // multiply 0, 2, 4, 6
-        auto prod_even = _mm256_mul_epi32(m_value, b.m_value);
-
-        // Swap 32-bit words
-        auto sh_a = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
-
-        auto sh_b = _mm256_castps_si256(
-                    _mm256_permute_ps(_mm256_castsi256_ps(b.m_value), 0xB1));
-
-        // multiply 1, 3, 5, 7
-        auto prod_odd = _mm256_mul_epi32(sh_a, sh_b);
-
-        // Stitch prod_odd and prod_even back together
-        auto sh_odd = _mm256_castps_si256(
-                    _mm256_permute_ps(_mm256_castsi256_ps(prod_odd), 0xB1));
-
-        return self_type(_mm256_blend_epi32(prod_even, sh_odd, 0xAA));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1) );
-        auto red1 = _mm256_add_epi32(m_value, sh1);
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(red1), 0x4E));
-        auto red2 = _mm256_add_epi32(red1, sh2);
-
-        return _mm256_extract_epi32(red2, 0) + _mm256_extract_epi32(red2, 4);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
-        auto red1 = _mm256_max_epi32(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
-        auto red2 = _mm256_max_epi32(red1, sh2);
-
-        return std::max<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::min();
-        }
-        if(N == 1){
-          return get(0);
-        }
-
-        if(N == 2){
-          return std::max<element_type>(get(0), get(1));
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
-        auto red1 = _mm256_max_epi32(m_value, sh1);
-
-        if(N == 3){
-          return std::max<element_type>(_mm256_extract_epi32(red1, 0), get(2));
-        }
-        if(N == 4){
-          return std::max<element_type>(_mm256_extract_epi32(red1, 0), _mm256_extract_epi32(red1, 2));
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
-        auto red2 = _mm256_max_epi32(red1, sh2);
-
-        return std::max<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_epi32(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
-        auto red1 = _mm256_min_epi32(m_value, sh1);
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
-        auto red2 = _mm256_min_epi32(red1, sh2);
-
-        return std::min<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::max();
-        }
-        if(N == 1){
-          return get(0);
-        }
-
-        if(N == 2){
-          return std::min<element_type>(get(0), get(1));
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
-        auto red1 = _mm256_min_epi32(m_value, sh1);
-
-        if(N == 3){
-          return std::min<element_type>(_mm256_extract_epi32(red1, 0), get(2));
-        }
-        if(N == 4){
-          return std::min<element_type>(_mm256_extract_epi32(red1, 0), _mm256_extract_epi32(red1, 2));
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
-        auto red2 = _mm256_min_epi32(red1, sh2);
-
-        return std::min<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_epi32(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+
+    // swap odd-even pairs and take the max
+    auto sh1  = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
+    auto red1 = _mm256_max_epi32(m_value, sh1);
+
+    // swap odd-even quads and take the max
+    auto sh2  = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
+    auto red2 = _mm256_max_epi32(red1, sh2);
+
+    return std::max<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Largest of elements 0..N-1; limits<int32_t>::min() if N not in 1..8
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<int32_t>::min();
+    }
+    if (N == 1)
+    {
+      return get(0);
+    }
+
+    if (N == 2)
+    {
+      return std::max<element_type>(get(0), get(1));
+    }
+
+    // swap odd-even pairs and take the max
+    auto sh1  = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
+    auto red1 = _mm256_max_epi32(m_value, sh1);
+
+    if (N == 3)
+    {
+      return std::max<element_type>(_mm256_extract_epi32(red1, 0), get(2));
+    }
+    if (N == 4)
+    {
+      return std::max<element_type>(_mm256_extract_epi32(red1, 0),
+                                    _mm256_extract_epi32(red1, 2));
+    }
+
+    // swap odd-even quads and take the max
+    auto sh2  = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
+    auto red2 = _mm256_max_epi32(red1, sh2);
+
+    return std::max<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_epi32(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // swap odd-even pairs and take the min
+    auto sh1  = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
+    auto red1 = _mm256_min_epi32(m_value, sh1);
+
+
+    // swap odd-even quads and take the min
+    auto sh2  = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
+    auto red2 = _mm256_min_epi32(red1, sh2);
+
+    return std::min<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Smallest of elements 0..N-1; limits<int32_t>::max() if N not in 1..8
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<int32_t>::max();
+    }
+    if (N == 1)
+    {
+      return get(0);
+    }
+
+    if (N == 2)
+    {
+      return std::min<element_type>(get(0), get(1));
+    }
+
+    // swap odd-even pairs and take the min
+    auto sh1  = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
+    auto red1 = _mm256_min_epi32(m_value, sh1);
+
+    if (N == 3)
+    {
+      return std::min<element_type>(_mm256_extract_epi32(red1, 0), get(2));
+    }
+    if (N == 4)
+    {
+      return std::min<element_type>(_mm256_extract_epi32(red1, 0),
+                                    _mm256_extract_epi32(red1, 2));
+    }
+
+    // swap odd-even quads and take the min
+    auto sh2  = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
+    auto red2 = _mm256_min_epi32(red1, sh2);
+
+    return std::min<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_epi32(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
index 00eea542cd..aa285f44e7 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
@@ -33,519 +33,533 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx2_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx2_register>>
+template <>
+class Register<int64_t, avx2_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type       = Register<int64_t, avx2_register>;
+  using element_type    = int64_t;
+  using register_type   = __m256i;
+
+  using int_vector_type = Register<int64_t, avx2_register>;
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+  /*
+   * Use the packed-double permute function because there isn't one
+   * specifically for int64
+   *
+   * Just adds a bunch of casting, should be same cost
+   */
+  template <int perm>
+  RAJA_INLINE __m256i permute(__m256i x) const
+  {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(x), perm));
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_epi64x(x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_epi64x(c)) {}
+
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_castpd_si256(_mm256_maskload_pd(
+        reinterpret_cast<double const*>(ptr), createMask(N)));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(int64_t const* ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const*>(ptr),
+                                     createStridedOffsets(stride),
+                                     sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    m_value = _mm256_mask_i64gather_epi64(
+        _mm256_set1_epi64x(0), reinterpret_cast<long long const*>(ptr),
+        createStridedOffsets(stride), createMask(N), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<int64_t, avx2_register>;
-      using element_type = int64_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int64_t, avx2_register>;
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-      /*
-       * Use the packed-double permute function because there isn't one
-       * specifically for int64
-       *
-       * Just adds a bunch of casting, should be same cost
-       */
-      template<int perm>
-      RAJA_INLINE
-      __m256i permute(__m256i x) const {
-        return _mm256_castpd_si256(
-            _mm256_permute_pd(_mm256_castsi256_pd(x), perm));
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_epi64x(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi64x(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_castpd_si256(
-            _mm256_maskload_pd(reinterpret_cast<double const *>(ptr), createMask(N))
-        );
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(int64_t const *ptr, camp::idx_t stride){
-        m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const *>(ptr),
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i64gather_epi64(_mm256_set1_epi64x(0),
-                                      reinterpret_cast<long long const *>(ptr),
-                                      createStridedOffsets(stride),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const *>(ptr),
-                                      offsets.get_register(),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
+    m_value =
+        _mm256_i64gather_epi64(reinterpret_cast<long long const*>(ptr),
+                               offsets.get_register(), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  gather_n(element_type const* ptr, int_vector_type offsets, camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),
-                                      reinterpret_cast<long long const *>(ptr),
-                                      offsets.get_register(),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi64(m_value, 0);
-          case 1: return _mm256_extract_epi64(m_value, 1);
-          case 2: return _mm256_extract_epi64(m_value, 2);
-          case 3: return _mm256_extract_epi64(m_value, 3);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi64(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi64(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi64(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi64(m_value, value, 3); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi64x(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // AVX2 does not supply an int64_t multiply, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)*b.get(3),
-            get(2)*b.get(2),
-            get(1)*b.get(1),
-            get(0)*b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-
-        // swap pairs and add
-        auto sh1 = permute<0x5>(m_value);
-        auto red1 = _mm256_add_epi64(m_value, sh1);
-
-        // add lower and upper
-        return _mm256_extract_epi64(red1, 0) + _mm256_extract_epi64(red1, 2);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red < v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red < v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red < v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::min();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red < v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red < v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red < v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) > a.get(3) ? get(3) : a.get(3),
-              get(2) > a.get(2) ? get(2) : a.get(2),
-              get(1) > a.get(1) ? get(1) : a.get(1),
-              get(0) > a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red > v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red > v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red > v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::max();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red > v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red > v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red > v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) < a.get(3) ? get(3) : a.get(3),
-              get(2) < a.get(2) ? get(2) : a.get(2),
-              get(1) < a.get(1) ? get(1) : a.get(1),
-              get(0) < a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-  };
-
-
-}   // namespace expt
+    m_value = _mm256_mask_i64gather_epi64(
+        _mm256_setzero_si256(), reinterpret_cast<long long const*>(ptr),
+        offsets.get_register(), createMask(N), sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), createMask(N),
+                           m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+    case 0:
+      return _mm256_extract_epi64(m_value, 0);
+    case 1:
+      return _mm256_extract_epi64(m_value, 1);
+    case 2:
+      return _mm256_extract_epi64(m_value, 2);
+    case 3:
+      return _mm256_extract_epi64(m_value, 3);
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+    case 0:
+      m_value = _mm256_insert_epi64(m_value, value, 0);
+      break;
+    case 1:
+      m_value = _mm256_insert_epi64(m_value, value, 1);
+      break;
+    case 2:
+      m_value = _mm256_insert_epi64(m_value, value, 2);
+      break;
+    case 3:
+      m_value = _mm256_insert_epi64(m_value, value, 3);
+      break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_epi64x(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    // AVX2 does not supply an int64_t multiply, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) * b.get(3), get(2) * b.get(2),
+                                       get(1) * b.get(1), get(0) * b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) / b.get(3), get(2) / b.get(2),
+                                       get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+
+    // swap pairs and add
+    auto sh1  = permute<0x5>(m_value);
+    auto red1 = _mm256_add_epi64(m_value, sh1);
+
+    // add lower and upper
+    return _mm256_extract_epi64(red1, 0) + _mm256_extract_epi64(red1, 2);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // AVX2 does not supply a 64-bit integer max?!?
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red     = red < v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red     = red < v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red     = red < v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar among the first N elements in the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4)
+    {
+      return RAJA::operators::limits<int64_t>::min();
+    }
+
+    // AVX2 does not supply a 64-bit integer max?!?
+    auto red = get(0);
+
+    if (N > 1)
+    {
+      auto v1 = get(1);
+      red     = red < v1 ? v1 : red;
+    }
+    if (N > 2)
+    {
+      auto v2 = get(2);
+      red     = red < v2 ? v2 : red;
+    }
+    if (N > 3)
+    {
+      auto v3 = get(3);
+      red     = red < v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) > a.get(3) ? get(3) : a.get(3),
+                                       get(2) > a.get(2) ? get(2) : a.get(2),
+                                       get(1) > a.get(1) ? get(1) : a.get(1),
+                                       get(0) > a.get(0) ? get(0) : a.get(0)));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // AVX2 does not supply a 64-bit integer min?!?
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red     = red > v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red     = red > v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red     = red > v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest scalar among the first N elements in the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4)
+    {
+      return RAJA::operators::limits<int64_t>::max();
+    }
+
+    // AVX2 does not supply a 64-bit integer min?!?
+    auto red = get(0);
+
+    if (N > 1)
+    {
+      auto v1 = get(1);
+      red     = red > v1 ? v1 : red;
+    }
+    if (N > 2)
+    {
+      auto v2 = get(2);
+      red     = red > v2 ? v2 : red;
+    }
+    if (N > 3)
+    {
+      auto v3 = get(3);
+      red     = red > v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) < a.get(3) ? get(3) : a.get(3),
+                                       get(2) < a.get(2) ? get(2) : a.get(2),
+                                       get(1) < a.get(1) ? get(1) : a.get(1),
+                                       get(0) < a.get(0) ? get(0) : a.get(0)));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/traits.hpp b/include/RAJA/policy/tensor/arch/avx2/traits.hpp
index e95c661335..d51b4ad853 100644
--- a/include/RAJA/policy/tensor/arch/avx2/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/traits.hpp
@@ -21,55 +21,60 @@
 #define RAJA_policy_tensor_arch_avx2_traits_HPP
 
 
-namespace RAJA {
-namespace internal {
-namespace expt {
-
-
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
-
-} // namespace intenral
-} // namespace expt
-} // namespace RAJA
-
-
-#endif // guard
-
-
-
-#endif // __AVX2__
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
+
+
+template <>
+struct RegisterTraits<RAJA::expt::avx2_register, int32_t>
+{
+  using element_type                      = int32_t;
+  using register_policy                   = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int32_t;
+};
+
+template <>
+struct RegisterTraits<RAJA::expt::avx2_register, int64_t>
+{
+  using element_type                      = int64_t;
+  using register_policy                   = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type                  = int64_t;
+};
+
+template <>
+struct RegisterTraits<RAJA::expt::avx2_register, float>
+{
+  using element_type                      = float;
+  using register_policy                   = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int32_t;
+};
+
+template <>
+struct RegisterTraits<RAJA::expt::avx2_register, double>
+{
+  using element_type                      = double;
+  using register_policy                   = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type                  = int64_t;
+};
+
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
+
+
+#endif  // guard
+
+
+#endif  // __AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx512.hpp b/include/RAJA/policy/tensor/arch/avx512.hpp
index 597563da35..71d0212c5e 100644
--- a/include/RAJA/policy/tensor/arch/avx512.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512.hpp
@@ -18,11 +18,11 @@
 // Check if the base AVX512 instructions are present
 #ifdef __AVX512F__
 
-#include<RAJA/policy/tensor/arch/avx512/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_float.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_double.hpp>
+#include <RAJA/policy/tensor/arch/avx512/traits.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_float.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_double.hpp>
 
 
-#endif // __AVX512F__
+#endif  // __AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
index a7b7ebaafa..824311a400 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
@@ -34,360 +34,371 @@ namespace RAJA
 namespace expt
 {
 
-  template<>
-  class Register<double, avx512_register> :
-    public internal::expt::RegisterBase<Register<double, avx512_register>>
+template <>
+class Register<double, avx512_register>
+    : public internal::expt::RegisterBase<Register<double, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx512_register>>;
+
+
+  using register_policy = avx512_register;
+  using self_type       = Register<double, avx512_register>;
+  using element_type    = double;
+  using register_type   = __m512d;
+
+  using int_vector_type = Register<int64_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask8 createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    switch (N)
+    {
+    case 0:
+      return __mmask8(0x00);
+    case 1:
+      return __mmask8(0x01);
+    case 2:
+      return __mmask8(0x03);
+    case 3:
+      return __mmask8(0x07);
+    case 4:
+      return __mmask8(0x0F);
+    case 5:
+      return __mmask8(0x1F);
+    case 6:
+      return __mmask8(0x3F);
+    case 7:
+      return __mmask8(0x7F);
+    case 8:
+      return __mmask8(0xFF);
+    }
+    return __mmask8(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    auto vstride = _mm512_set1_epi64(stride);
+    auto vseq    = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi64(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_pd()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const& c) : base_type(), m_value(_mm512_set1_pd(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // AVX512F
+    m_value = _mm512_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_loadu_pd(_mm512_setzero_pd(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i64gather_pd(createStridedOffsets(stride), ptr,
+                                  sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), createMask(N),
+                                       createStridedOffsets(stride), ptr,
+                                       sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // AVX512F
+    _mm512_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i64scatter_pd(ptr, createStridedOffsets(stride), m_value,
+                         sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i64scatter_pd(ptr, createMask(N), createStridedOffsets(stride),
+                              m_value, sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm512_set1_pd(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm512_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx512_register>>;
-
-
-      using register_policy = avx512_register;
-      using self_type = Register<double, avx512_register>;
-      using element_type = double;
-      using register_type = __m512d;
-
-      using int_vector_type = Register<int64_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask8 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0: return __mmask8(0x00);
-					case 1: return __mmask8(0x01);
-					case 2: return __mmask8(0x03);
-					case 3: return __mmask8(0x07);
-					case 4: return __mmask8(0x0F);
-					case 5: return __mmask8(0x1F);
-					case 6: return __mmask8(0x3F);
-					case 7: return __mmask8(0x7F);
-					case 8: return __mmask8(0xFF);
-				}
-				return __mmask8(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi64(stride);
-				auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi64(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_pd(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        m_value = _mm512_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_pd(_mm512_setzero_pd(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i64gather_pd(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i64gather_pd(_mm512_setzero_pd(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        _mm512_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i64scatter_pd(ptr, 
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i64scatter_pd(ptr, 
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_pd(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm512_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        return self_type(_mm512_maskz_div_pd(createMask(N), m_value, b.m_value));
-      }
+    return self_type(_mm512_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm512_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm512_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    return self_type(_mm512_maskz_div_pd(createMask(N), m_value, b.m_value));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmadd_pd(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmsub_pd(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm512_fmadd_pd(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm512_fmsub_pd(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_pd(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_pd(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_pd(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_pd(m_value); }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_pd(m_value); }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest of the first N scalar elements in the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_pd(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_pd(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_pd(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest of the first N scalar elements in the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_pd(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
index 84cb034a56..004fe3fffa 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
@@ -33,367 +33,387 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<float, avx512_register> :
-    public internal::expt::RegisterBase<Register<float, avx512_register>>
+template <>
+class Register<float, avx512_register>
+    : public internal::expt::RegisterBase<Register<float, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<float, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type       = Register<float, avx512_register>;
+  using element_type    = float;
+  using register_type   = __m512;
+
+  using int_vector_type = Register<int32_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask16 createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    switch (N)
+    {
+    case 0:
+      return __mmask16(0x0000);
+    case 1:
+      return __mmask16(0x0001);
+    case 2:
+      return __mmask16(0x0003);
+    case 3:
+      return __mmask16(0x0007);
+    case 4:
+      return __mmask16(0x000F);
+    case 5:
+      return __mmask16(0x001F);
+    case 6:
+      return __mmask16(0x003F);
+    case 7:
+      return __mmask16(0x007F);
+    case 8:
+      return __mmask16(0x00FF);
+    case 9:
+      return __mmask16(0x01FF);
+    case 10:
+      return __mmask16(0x03FF);
+    case 11:
+      return __mmask16(0x07FF);
+    case 12:
+      return __mmask16(0x0FFF);
+    case 13:
+      return __mmask16(0x1FFF);
+    case 14:
+      return __mmask16(0x3FFF);
+    case 15:
+      return __mmask16(0x7FFF);
+    case 16:
+      return __mmask16(0xFFFF);
+    }
+    return __mmask16(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    auto vstride = _mm512_set1_epi32(stride);
+    auto vseq =
+        _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi32(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 16;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_ps()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const& c) : base_type(), m_value(_mm512_set1_ps(c)) {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // AVX512F
+    m_value = _mm512_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_loadu_ps(_mm512_setzero_ps(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<float, avx512_register>;
-      using element_type = float;
-      using register_type = __m512;
-
-      using int_vector_type = Register<int32_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask16 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0:  return __mmask16(0x0000);
-					case 1:  return __mmask16(0x0001);
-					case 2:  return __mmask16(0x0003);
-					case 3:  return __mmask16(0x0007);
-					case 4:  return __mmask16(0x000F);
-					case 5:  return __mmask16(0x001F);
-					case 6:  return __mmask16(0x003F);
-					case 7:  return __mmask16(0x007F);
-					case 8:  return __mmask16(0x00FF);
-          case 9:  return __mmask16(0x01FF);
-          case 10: return __mmask16(0x03FF);
-          case 11: return __mmask16(0x07FF);
-          case 12: return __mmask16(0x0FFF);
-          case 13: return __mmask16(0x1FFF);
-          case 14: return __mmask16(0x3FFF);
-          case 15: return __mmask16(0x7FFF);
-          case 16: return __mmask16(0xFFFF);
-				}
-				return __mmask16(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi32(stride);
-				auto vseq = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi32(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 16;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_ps(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        m_value = _mm512_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_ps(_mm512_setzero_ps(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i32gather_ps(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i32gather_ps(_mm512_setzero_ps(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        _mm512_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i32scatter_ps(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i32scatter_ps(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm512_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        return self_type(_mm512_maskz_div_ps(createMask(N), m_value, b.m_value));
-      }
+    // AVX512F
+    m_value = _mm512_i32gather_ps(createStridedOffsets(stride), ptr,
+                                  sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), createMask(N),
+                                       createStridedOffsets(stride), ptr,
+                                       sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // AVX512F
+    _mm512_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i32scatter_ps(ptr, createStridedOffsets(stride), m_value,
+                         sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i32scatter_ps(ptr, createMask(N), createStridedOffsets(stride),
+                              m_value, sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm512_set1_ps(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm512_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm512_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm512_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm512_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    return self_type(_mm512_maskz_div_ps(createMask(N), m_value, b.m_value));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmadd_ps(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmsub_ps(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm512_fmadd_ps(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm512_fmsub_ps(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_ps(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_ps(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_ps(m_value); }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_ps(m_value); }
+
+  /*!
+   * @brief Returns the largest element among lanes enabled by createMask(N)
+   * @return The largest scalar element among the masked lanes
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_ps(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_ps(m_value); }
+
+  /*!
+   * @brief Returns the smallest element among lanes enabled by createMask(N)
+   * @return The smallest scalar element among the masked lanes
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_ps(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
index 021ca90fbe..e3ecac4520 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
@@ -33,419 +33,440 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int32_t, avx512_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx512_register>>
+template <>
+class Register<int32_t, avx512_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type       = Register<int32_t, avx512_register>;
+  using element_type    = int32_t;
+  using register_type   = __m512i;
+
+  using int_vector_type = Register<int32_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask16 createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    switch (N)
+    {
+    case 0:
+      return __mmask16(0x0000);
+    case 1:
+      return __mmask16(0x0001);
+    case 2:
+      return __mmask16(0x0003);
+    case 3:
+      return __mmask16(0x0007);
+    case 4:
+      return __mmask16(0x000F);
+    case 5:
+      return __mmask16(0x001F);
+    case 6:
+      return __mmask16(0x003F);
+    case 7:
+      return __mmask16(0x007F);
+    case 8:
+      return __mmask16(0x00FF);
+    case 9:
+      return __mmask16(0x01FF);
+    case 10:
+      return __mmask16(0x03FF);
+    case 11:
+      return __mmask16(0x07FF);
+    case 12:
+      return __mmask16(0x0FFF);
+    case 13:
+      return __mmask16(0x1FFF);
+    case 14:
+      return __mmask16(0x3FFF);
+    case 15:
+      return __mmask16(0x7FFF);
+    case 16:
+      return __mmask16(0xFFFF);
+    }
+    return __mmask16(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    auto vstride = _mm512_set1_epi32(stride);
+    auto vseq =
+        _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi32(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 16;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_epi32()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const& c) : base_type(), m_value(_mm512_set1_epi32(c))
+  {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // AVX512F
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
+    m_value = _mm512_loadu_si512(ptr);
+#else
+    m_value = _mm512_loadu_epi32(ptr);  // GNU 7-9 are missing this instruction.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<int32_t, avx512_register>;
-      using element_type = int32_t;
-      using register_type = __m512i;
-
-      using int_vector_type = Register<int32_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask16 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0:  return __mmask16(0x0000);
-					case 1:  return __mmask16(0x0001);
-					case 2:  return __mmask16(0x0003);
-					case 3:  return __mmask16(0x0007);
-					case 4:  return __mmask16(0x000F);
-					case 5:  return __mmask16(0x001F);
-					case 6:  return __mmask16(0x003F);
-					case 7:  return __mmask16(0x007F);
-					case 8:  return __mmask16(0x00FF);
-          case 9:  return __mmask16(0x01FF);
-          case 10: return __mmask16(0x03FF);
-          case 11: return __mmask16(0x07FF);
-          case 12: return __mmask16(0x0FFF);
-          case 13: return __mmask16(0x1FFF);
-          case 14: return __mmask16(0x3FFF);
-          case 15: return __mmask16(0x7FFF);
-          case 16: return __mmask16(0xFFFF);
-				}
-				return __mmask16(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi32(stride);
-				auto vseq = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi32(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 16;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_epi32()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_epi32(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
-        m_value = _mm512_loadu_si512(ptr);
-        #else
-        m_value = _mm512_loadu_epi32(ptr);  // GNU 7-9 are missing this instruction.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_epi32(_mm512_setzero_epi32(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i32gather_epi32(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
-        _mm512_storeu_si512(ptr, m_value);
-        #else
-        _mm512_storeu_epi32(ptr, m_value);  // GNU 7-9 are missing this instruction.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_epi32(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i32scatter_epi32(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i32scatter_epi32(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // GNU 7-10 are missing this instruction.
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))
-        #define _mm512_cvtsi512_si32(x) _mm_cvtsi128_si32(_mm512_castsi512_si128(x))
-        #endif
-
-				switch(i){	
-					case 0: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 0));
-					case 1: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 1));
-					case 2: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 2));
-					case 3: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 3));
-					case 4: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 4));
-					case 5: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 5));
-					case 6: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 6));
-					case 7: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 7));
-					case 8: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 8));
-					case 9: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 9));
-					case 10: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 10));
-					case 11: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 11));
-					case 12: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 12));
-					case 13: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 13));
-					case 14: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 14));
-					case 15: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 15));
-				}
-				return 0;
-			}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-				m_value = _mm512_mask_set1_epi32(m_value, 1 << i, value);
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mullo_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi32(
-            get(15)/b.get(15),
-            get(14)/b.get(14),
-            get(13)/b.get(13),
-            get(12)/b.get(12),
-            get(11)/b.get(11),
-            get(10)/b.get(10),
-            get(9)/b.get(9),
-            get(8)/b.get(8),
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi32(
-            N >= 16 ? get(15)/b.get(15) : 0,
-            N >= 15 ? get(14)/b.get(14) : 0,
-            N >= 14 ? get(13)/b.get(13) : 0,
-            N >= 13 ? get(12)/b.get(12) : 0,
-            N >= 12 ? get(11)/b.get(11) : 0,
-            N >= 11 ? get(10)/b.get(10) : 0,
-            N >= 10 ? get(9)/b.get(9) : 0,
-            N >= 9 ? get(8)/b.get(8) : 0,
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_epi32(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_epi32(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_epi32(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_epi32(m_value, a.m_value));
-      }
-  };
-
-}   // namespace expt
+    // AVX512F
+    m_value =
+        _mm512_mask_loadu_epi32(_mm512_setzero_epi32(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i32gather_epi32(createStridedOffsets(stride), ptr,
+                                     sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), createMask(N),
+                                          createStridedOffsets(stride), ptr,
+                                          sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // AVX512F
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
+    _mm512_storeu_si512(ptr, m_value);
+#else
+    _mm512_storeu_epi32(ptr, m_value);  // GNU 7-9 are missing this instruction.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_epi32(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i32scatter_epi32(ptr, createStridedOffsets(stride), m_value,
+                            sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Scatter partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i32scatter_epi32(ptr, createMask(N),
+                                 createStridedOffsets(stride), m_value,
+                                 sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+// GNU 7-10 are missing this instruction.
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))
+#define _mm512_cvtsi512_si32(x) _mm_cvtsi128_si32(_mm512_castsi512_si128(x))
+#endif
+
+    switch (i)
+    {
+    case 0:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 0));
+    case 1:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 1));
+    case 2:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 2));
+    case 3:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 3));
+    case 4:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 4));
+    case 5:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 5));
+    case 6:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 6));
+    case 7:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 7));
+    case 8:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 8));
+    case 9:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 9));
+    case 10:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 10));
+    case 11:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 11));
+    case 12:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 12));
+    case 13:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 13));
+    case 14:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 14));
+    case 15:
+      return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 15));
+    }
+    return 0;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value = _mm512_mask_set1_epi32(m_value, 1 << i, value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm512_set1_epi32(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm512_add_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm512_sub_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm512_mullo_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi32(
+        get(15) / b.get(15), get(14) / b.get(14), get(13) / b.get(13),
+        get(12) / b.get(12), get(11) / b.get(11), get(10) / b.get(10),
+        get(9) / b.get(9), get(8) / b.get(8), get(7) / b.get(7),
+        get(6) / b.get(6), get(5) / b.get(5), get(4) / b.get(4),
+        get(3) / b.get(3), get(2) / b.get(2), get(1) / b.get(1),
+        get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi32(
+        N >= 16 ? get(15) / b.get(15) : 0, N >= 15 ? get(14) / b.get(14) : 0,
+        N >= 14 ? get(13) / b.get(13) : 0, N >= 13 ? get(12) / b.get(12) : 0,
+        N >= 12 ? get(11) / b.get(11) : 0, N >= 11 ? get(10) / b.get(10) : 0,
+        N >= 10 ? get(9) / b.get(9) : 0, N >= 9 ? get(8) / b.get(8) : 0,
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_epi32(m_value); }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_epi32(m_value); }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_epi32(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_epi32(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_epi32(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type min(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_epi32(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_epi32(m_value, a.m_value));
+  }
+};
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
index 17f929c607..b99c1a09ab 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
@@ -33,373 +33,386 @@ namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx512_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx512_register>>
+template <>
+class Register<int64_t, avx512_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type       = Register<int64_t, avx512_register>;
+  using element_type    = int64_t;
+  using register_type   = __m512i;
+
+  using int_vector_type = Register<int64_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask8 createMask(camp::idx_t N) const
+  {
+    // Generate a mask with the low N bits set; N outside 0..8 yields 0
+    switch (N)
+    {
+    case 0:
+      return __mmask8(0x00);
+    case 1:
+      return __mmask8(0x01);
+    case 2:
+      return __mmask8(0x03);
+    case 3:
+      return __mmask8(0x07);
+    case 4:
+      return __mmask8(0x0F);
+    case 5:
+      return __mmask8(0x1F);
+    case 6:
+      return __mmask8(0x3F);
+    case 7:
+      return __mmask8(0x7F);
+    case 8:
+      return __mmask8(0xFF);
+    }
+    return __mmask8(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list: lane i holds i * stride
+    auto vstride = _mm512_set1_epi64(stride);
+    auto vseq    = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi64(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_epi32()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const& c) : base_type(), m_value(_mm512_set1_epi64(c))
+  {}
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // AVX512F
+#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) ||            \
+    (!defined(SYCL_LANGUAGE_VERSION) &&                                        \
+     defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
+    m_value = _mm512_maskz_loadu_epi64(
+        ~0,
+        ptr);  // May cause slowdown due to looping over 8 bytes, one at a time.
+#else
+    m_value =
+        _mm512_loadu_epi64(ptr);  // GNU 7-10 are missing this instruction, as
+                                  // is icpx as of version 2022.2.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    // AVX512F
+    m_value =
+        _mm512_mask_loadu_epi64(_mm512_setzero_epi32(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i64gather_epi64(createStridedOffsets(stride), ptr,
+                                     sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements; unselected lanes are zeroed.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  load_strided_n(element_type const* ptr, camp::idx_t stride, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i64gather_epi64(_mm512_setzero_epi32(), createMask(N),
+                                          createStridedOffsets(stride), ptr,
+                                          sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // AVX512F
+#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) ||            \
+    (!defined(SYCL_LANGUAGE_VERSION) &&                                        \
+     defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
+    _mm512_mask_storeu_epi64(ptr, ~0,
+                             m_value);  // May cause slowdown due to looping
+                                        // over 8 bytes, one at a time.
+#else
+    _mm512_storeu_epi64(ptr,
+                        m_value);  // GNU 7-10 are missing this instruction, as
+                                   // is icpx as of version 2022.2.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_epi64(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i64scatter_epi64(ptr, createStridedOffsets(stride), m_value,
+                            sizeof(element_type));
+    return *this;
+  }
+
+
+  /*!
+   * @brief Scatter partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t stride, camp::idx_t N) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<int64_t, avx512_register>;
-      using element_type = int64_t;
-      using register_type = __m512i;
-
-      using int_vector_type = Register<int64_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask8 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0: return __mmask8(0x00);
-					case 1: return __mmask8(0x01);
-					case 2: return __mmask8(0x03);
-					case 3: return __mmask8(0x07);
-					case 4: return __mmask8(0x0F);
-					case 5: return __mmask8(0x1F);
-					case 6: return __mmask8(0x3F);
-					case 7: return __mmask8(0x7F);
-					case 8: return __mmask8(0xFF);
-				}
-				return __mmask8(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi64(stride);
-				auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi64(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_epi32()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_epi64(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        #if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
-            (!defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
-        m_value = _mm512_maskz_loadu_epi64(~0, ptr);  // May cause slowdown due to looping over 8 bytes, one at a time.
-        #else
-        m_value = _mm512_loadu_epi64(ptr);  // GNU 7-10 are missing this instruction, as is icpx as of version 2022.2.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_epi64(_mm512_setzero_epi32(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i64gather_epi64(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i64gather_epi64(_mm512_setzero_epi32(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        #if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
-            (!defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
-        _mm512_mask_storeu_epi64(ptr, ~0, m_value);  // May cause slowdown due to looping over 8 bytes, one at a time.
-        #else
-        _mm512_storeu_epi64(ptr, m_value);  // GNU 7-10 are missing this instruction, as is icpx as of version 2022.2.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_epi64(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i64scatter_epi64(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i64scatter_epi64(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_epi64(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mullo_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi64(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi64(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_epi64(m_value);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_epi64(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_epi64(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_epi64(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_epi64(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_epi64(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_epi64(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    // AVX512F
+    _mm512_mask_i64scatter_epi64(ptr, createMask(N),
+                                 createStridedOffsets(stride), m_value,
+                                 sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm512_set1_epi64(value);
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm512_add_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm512_sub_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm512_mullo_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi64(get(7) / b.get(7), get(6) / b.get(6),
+                                      get(5) / b.get(5), get(4) / b.get(4),
+                                      get(3) / b.get(3), get(2) / b.get(2),
+                                      get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi64(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_epi64(m_value); }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_epi64(m_value); }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar element among lanes selected by createMask(N)
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_epi64(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_epi64(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_epi64(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest scalar element among lanes selected by createMask(N)
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_epi64(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_epi64(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/traits.hpp b/include/RAJA/policy/tensor/arch/avx512/traits.hpp
index b2b5cf6731..3088b0b8ae 100644
--- a/include/RAJA/policy/tensor/arch/avx512/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/traits.hpp
@@ -21,53 +21,59 @@
 #ifndef RAJA_policy_tensor_arch_avx512_traits_HPP
 #define RAJA_policy_tensor_arch_avx512_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 16;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx512_register, int32_t>
+{
+  using element_type                      = int32_t;
+  using register_policy                   = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 16;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx512_register, int64_t>
+{
+  using element_type                      = int64_t;
+  using register_policy                   = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int64_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 16;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx512_register, float>
+{
+  using element_type                      = float;
+  using register_policy                   = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 16;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::avx512_register, double>
+{
+  using element_type                      = double;
+  using register_policy                   = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int64_t;
+};
 
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
-#endif // guard
+#endif  // guard
 
 
-
-#endif // __AVX512F__
+#endif  // __AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/cuda.hpp b/include/RAJA/policy/tensor/arch/cuda.hpp
index a840c63d85..cfda807e68 100644
--- a/include/RAJA/policy/tensor/arch/cuda.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda.hpp
@@ -21,11 +21,11 @@
 #ifndef RAJA_policy_tensor_arch_cuda_HPP
 #define RAJA_policy_tensor_arch_cuda_HPP
 
-#include<RAJA/policy/tensor/arch/cuda/traits.hpp>
-#include<RAJA/policy/tensor/arch/cuda/cuda_warp.hpp>
+#include <RAJA/policy/tensor/arch/cuda/traits.hpp>
+#include <RAJA/policy/tensor/arch/cuda/cuda_warp.hpp>
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_CUDA
diff --git a/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp b/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
index e23eb92bed..81b19709ab 100644
--- a/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
@@ -30,981 +30,1019 @@
 #define RAJA_policy_tensor_arch_cuda_cuda_warp_register_HPP
 
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<typename ELEMENT_TYPE>
-  class Register<ELEMENT_TYPE, cuda_warp_register> :
-    public internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>
-  {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>;
-
-      using register_policy = cuda_warp_register;
-      using self_type = Register<ELEMENT_TYPE, cuda_warp_register>;
-      using element_type = ELEMENT_TYPE;
-      using register_type = ELEMENT_TYPE;
-
-      using int_vector_type = Register<int64_t, cuda_warp_register>;
-
-
-		private:
-      element_type m_value;
+template <typename ELEMENT_TYPE>
+class Register<ELEMENT_TYPE, cuda_warp_register>
+    : public internal::expt::RegisterBase<
+          Register<ELEMENT_TYPE, cuda_warp_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>;
+
+  using register_policy = cuda_warp_register;
+  using self_type       = Register<ELEMENT_TYPE, cuda_warp_register>;
+  using element_type    = ELEMENT_TYPE;
+  using register_type   = ELEMENT_TYPE;
+
+  using int_vector_type = Register<int64_t, cuda_warp_register>;
+
+
+private:
+  element_type m_value;
+
+public:
+  static constexpr int s_num_elem = 32;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register() : base_type(), m_value(0) {}
+
+
+  /*!
+   * @brief Copy constructor from raw value
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register(element_type c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
 
-		public:
-
-      static constexpr int s_num_elem = 32;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register() : base_type(), m_value(0) {
-
-      }
-
-
-      /*!
-       * @brief Copy constructor from raw value
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(element_type c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment operator
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(element_type c){
-        m_value = c;
-        return *this;
-      }
-
-      /*!
-       * @brief Gets our warp lane
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      static
-      int get_lane() {
-        return threadIdx.x;
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type const &get_raw_value() const {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      element_type &get_raw_value() {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return get_lane() == 0;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed(element_type const *ptr){
-
-        auto lane = get_lane();
-
-        m_value = ptr[lane];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed_n(element_type const *ptr, int N){
-        auto lane = get_lane();
-        if(lane < N){
-          m_value = ptr[lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided(element_type const *ptr, int stride){
-
-        auto lane = get_lane();
-
-        m_value = ptr[stride*lane];
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided_n(element_type const *ptr, int stride, int N){
-        auto lane = get_lane();
-
-        if(lane < N){
-          m_value = ptr[stride*lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get_raw_value()];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(get_lane() < N){
-          m_value = ptr[offsets.get_raw_value()];
-        }
-        else{
-          m_value = element_type(0);
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        m_value = ptr[seg*stride_outer + i*stride_inner];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          m_value = element_type(0);
-        }
-        else{
-          m_value = ptr[seg*stride_outer + i*stride_inner];
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed(element_type *ptr) const{
-
-        auto lane = get_lane();
-
-        ptr[lane] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed_n(element_type *ptr, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided(element_type *ptr, int stride) const{
-
-        auto lane = get_lane();
-
-        ptr[lane*stride] = m_value;
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided_n(element_type *ptr, int stride, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane*stride] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, T2 const &offsets) const {
-
-        ptr[offsets.get_raw_value()] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, T2 const &offsets, camp::idx_t N) const {
-        if(get_lane() < N){
-          ptr[offsets.get_raw_value()] = m_value;
-        }
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        ptr[seg*stride_outer + i*stride_inner] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays where we store partial segments.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          ptr[seg*stride_outer + i*stride_inner] = m_value;
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type get(int i) const
-			{
-        return  __shfl_sync(0xffffffff, m_value, i, 32);
-			}
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &set(element_type value, int i)
-			{
-				auto lane = get_lane();
-      	if(lane == i){
-					m_value = value;
-				}
-        return *this;
-			}
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.m_value = __shfl_sync(0xffffffff, m_value, i, 32);
-        return x;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, int N) const {
-        return get_lane() < N ? self_type(m_value / b.m_value) : self_type(element_type(0));
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMA
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& operator=(element_type c)
+  {
+    m_value = c;  // overwrite the calling thread's element with the scalar c
+    return *this;  // returned by reference so assignments can be chained
+  }
+
+  /*!
+   * @brief Gets our warp lane: threadIdx.x as-is, no "% 32" is applied
+   */
+  RAJA_INLINE
+  RAJA_DEVICE  // NOTE(review): presumably launched with blockDim.x == 32; confirm
+  constexpr static int get_lane() { return threadIdx.x; }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE  // read-only access to the calling thread's stored element
+  constexpr element_type const& get_raw_value() const { return m_value; }
+
+  RAJA_DEVICE
+  RAJA_INLINE  // mutable access: callers may assign through the reference
+  element_type& get_raw_value() { return m_value; }
+
+  RAJA_DEVICE
+  RAJA_INLINE  // true only for the thread whose get_lane() is 0
+  static constexpr bool is_root() { return get_lane() == 0; }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   * @param ptr Base address; each thread reads ptr[get_lane()], unguarded.
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& load_packed(element_type const* ptr)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[lane];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   * Lanes >= N do not touch memory and are set to element_type(0).
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& load_packed_n(element_type const* ptr, int N)
+  {
+    auto lane = get_lane();
+    if (lane < N)
+    {
+      m_value = ptr[lane];
+    }
+    else
+    {
+      m_value = element_type(0);  // zero-fill the lanes past the valid range
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   * @param stride Distance in elements; each thread reads ptr[stride * lane].
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& load_strided(element_type const* ptr, int stride)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[stride * lane];
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   * Lanes < N read ptr[stride * lane]; lanes >= N are set to element_type(0).
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& load_strided_n(element_type const* ptr, int stride, int N)
+  {
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      m_value = ptr[stride * lane];
+    }
+    else
+    {
+      m_value = element_type(0);  // zero-fill the lanes past the valid range
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   * Each thread reads ptr[offsets.get_raw_value()]; no bounds check is made.
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
+
+    m_value = ptr[offsets.get_raw_value()];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   * Lanes >= N ignore their offset and are set to element_type(0).
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type&
+  gather_n(element_type const* ptr, int_vector_type offsets, camp::idx_t N)
+  {
+    if (get_lane() < N)
+    {
+      m_value = ptr[offsets.get_raw_value()];
+    }
+    else
+    {
+      m_value = element_type(0);  // zero-fill the lanes past the valid range
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   * Each thread reads ptr[seg * stride_outer + i * stride_inner], where i is
+   * the low segbits bits of its lane and seg is the remaining high bits.
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type& segmented_load(element_type const* ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    auto lane = get_lane();
+
+    // split lane into segment index (seg) and index within the segment (i)
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    m_value = ptr[seg * stride_outer + i * stride_inner];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   * A lane loads only if seg < num_outer and i < num_inner; every other
+   * lane is set to element_type(0) without reading memory.
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type& segmented_load_nm(element_type const* ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
+    auto lane = get_lane();
+
+    // split lane into segment index (seg) and index within the segment (i)
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      m_value = element_type(0);
+    }
+    else
+    {
+      m_value = ptr[seg * stride_outer + i * stride_inner];
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   * @param ptr Destination base; each thread writes m_value to ptr[lane].
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const& store_packed(element_type* ptr) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N lanes of the register to consecutive memory
+   *        locations; lanes >= N write nothing.
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const& store_packed_n(element_type* ptr, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      ptr[lane] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   * @param stride Distance in elements; each thread writes ptr[lane * stride].
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const& store_strided(element_type* ptr, int stride) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane * stride] = m_value;
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   * Lanes < N write ptr[lane * stride]; lanes >= N write nothing.
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const& store_strided_n(element_type* ptr, int stride, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      ptr[lane * stride] = m_value;
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   * Each thread writes ptr[offsets.get_raw_value()]; no bounds check is made.
+   */
+  template <typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const& scatter(element_type* ptr,
+                                                   T2 const& offsets) const
+  {
+
+    ptr[offsets.get_raw_value()] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   * Only lanes < N store; lanes >= N leave memory untouched.
+   */
+  template <typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const&
+  scatter_n(element_type* ptr, T2 const& offsets, camp::idx_t N) const
+  {
+    if (get_lane() < N)
+    {
+      ptr[offsets.get_raw_value()] = m_value;
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays.
+   * Each thread writes m_value to ptr[seg * stride_outer + i * stride_inner].
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type const& segmented_store(element_type* ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    auto lane = get_lane();
+
+    // split lane into segment index (seg) and index within the segment (i)
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    ptr[seg * stride_outer + i * stride_inner] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays where we store partial segments.
+   * A lane stores only if seg < num_outer and i < num_inner.
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type const& segmented_store_nm(element_type* ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
+    auto lane = get_lane();
+
+    // split lane into segment index (seg) and index within the segment (i)
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      // nop: this lane lies outside the num_outer x num_inner sub-block
+    }
+    else
+    {
+      ptr[seg * stride_outer + i * stride_inner] = m_value;
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset (lane index) of scalar to get
+   * @return Returns scalar value at i
+   */
+  constexpr RAJA_INLINE RAJA_DEVICE element_type get(int i) const
+  {  // NOTE(review): constexpr on a body that calls __shfl_sync; confirm intent
+    return __shfl_sync(0xffffffff, m_value, i, 32);  // full mask, width 32
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& set(element_type value, int i)
+  {
+    auto lane = get_lane();
+    if (lane == i)  // only the thread at lane i is updated; others keep m_value
+    {
+      m_value = value;
+    }
+    return *this;
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE  // every calling thread stores the same scalar a
+  self_type& broadcast(element_type const& a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+  /*!
+   * @brief Extracts the scalar held by lane i and broadcasts to a new register
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;  // result register; *this is not modified
+    x.m_value = __shfl_sync(0xffffffff, m_value, i, 32);  // full mask, width 32
+    return x;
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE  // per-thread copy of src's element into this register
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE  // per-thread sum; returns a new register, *this is unchanged
+  self_type add(self_type const& b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE  // per-thread difference (this - b); *this is unchanged
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE  // per-thread product; returns a new register, *this is unchanged
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE  // per-thread quotient (this / b); no check for a zero divisor
+  self_type divide(self_type const& b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE  // quotient for lanes < N; lanes >= N get element_type(0)
+  self_type divide_n(self_type const& b, int N) const
+  {
+    return get_lane() < N ? self_type(m_value / b.m_value)
+                          : self_type(element_type(0));  // division is skipped
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMA
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(fma(m_value, b.m_value, c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value + c.m_value);
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMS
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(m_value * b.m_value + c.m_value);
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMA with a negated addend
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, -c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(fma(m_value, b.m_value, -c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value - c.m_value);
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type sum() const
-      {
-				// Allreduce sum
-				using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
-
-				return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max() const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max_n(int N) const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        auto ident = RAJA::operators::limits<element_type>::min();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmax(self_type a) const
-      {
-        return self_type{RAJA::max<element_type>(m_value, a.m_value)};
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min() const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min_n(int N) const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        auto ident = RAJA::operators::limits<element_type>::max();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmin(self_type a) const
-      {
-        return self_type{RAJA::min<element_type>(m_value, a.m_value)};
-      }
-
-
-
-
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
-      {
-        int_vector_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        result.get_raw_value() = seg*stride_outer + i*stride_inner;
-
-        return result;
-      }
-
-
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int delta = 1;delta < 1<<segbits;delta = delta<<1){
-
-          // tree shuffle
-          element_type y = __shfl_sync(0xffffffff, x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        result.get_raw_value() = __shfl_sync(0xffffffff, x, get_lane()<<segbits);
-
-        // Third: mask off everything but output_segment
-        //        this is because all output segments are valid at this point
-        // (5-segbits), the 5 is since the warp-width is 32 == 1<<5
-        int our_output_segment = get_lane()>>(5-segbits);
-        bool in_output_segment = our_output_segment == output_segment;
-        if(!in_output_segment){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      /*!
-       * Sum across segments, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums strided pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int i = 0;i < 5-segbits; ++ i){
-
-          // tree shuffle
-          int delta = s_num_elem >> (i+1);
-          element_type y = __shfl_sync(0xffffffff, x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        int get_from = get_lane()&( (1<<segbits)-1);
-        result.get_raw_value() = __shfl_sync(0xffffffff, x, get_from);
-
-        int mask = (get_lane()>>segbits) == output_segment;
-
-
-        // Third: mask off everything but output_segment
-        if(!mask){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        self_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          result.get_raw_value() = m_value / den.get_raw_value();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      output_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
-
-
-        camp::idx_t i = (get_lane()&mask) + offset;
-
-        result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
-
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
-
-        camp::idx_t i = (get_lane() >> segbits) + offset;
-
-        result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
-
-        return result;
-      }
-
-
-
-
-  };
-
-
-
-}   // namespace expt
-
-} // namespace RAJA
-
-
-#endif // Guard
-
-#endif // CUDA
+                              RETURN_TYPE>::type
+      multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(m_value * b.m_value - c.m_value);
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type sum() const
+  {
+    // Allreduce sum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type max() const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the largest element from the first N lanes
+   * @return The largest scalar element among lanes 0 through N-1
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type max_n(int N) const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    auto ident = RAJA::operators::limits<element_type>::min();
+    auto lane  = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type vmax(self_type a) const
+  {
+    return self_type {RAJA::max<element_type>(m_value, a.m_value)};
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type min() const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the smallest element from the first N lanes
+   * @return The smallest scalar element among lanes 0 through N-1
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type min_n(int N) const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    auto ident = RAJA::operators::limits<element_type>::max();
+    auto lane  = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type vmin(self_type a) const
+  {
+    return self_type {RAJA::min<element_type>(m_value, a.m_value)};
+  }
+
+
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
+
+    auto lane = get_lane();
+
+    // compute segment index and element index within the segment
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    result.get_raw_value() = seg * stride_outer + i * stride_inner;
+
+    return result;
+  }
+
+
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values within each segment
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int delta = 1; delta < 1 << segbits; delta = delta << 1)
+    {
+
+      // tree shuffle
+      element_type y = __shfl_sync(0xffffffff, x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    result.get_raw_value() = __shfl_sync(0xffffffff, x, get_lane() << segbits);
+
+    // Third: mask off everything but output_segment
+    //        this is because all output segments are valid at this point
+    // (5-segbits), the 5 is since the warp-width is 32 == 1<<5
+    int our_output_segment = get_lane() >> (5 - segbits);
+    bool in_output_segment = our_output_segment == output_segment;
+    if (!in_output_segment)
+    {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  /*!
+   * Sum across segments, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums strided pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values across segments (strided shuffles)
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int i = 0; i < 5 - segbits; ++i)
+    {
+
+      // tree shuffle
+      int delta      = s_num_elem >> (i + 1);
+      element_type y = __shfl_sync(0xffffffff, x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    int get_from           = get_lane() & ((1 << segbits) - 1);
+    result.get_raw_value() = __shfl_sync(0xffffffff, x, get_from);
+
+    int mask = (get_lane() >> segbits) == output_segment;
+
+
+    // Third: mask off everything but output_segment
+    if (!mask)
+    {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    self_type result;
+
+    auto lane = get_lane();
+
+    // compute segment index and element index within the segment
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      // nop: out-of-range lanes keep the default-constructed result
+    }
+    else
+    {
+      result.get_raw_value() = m_value / den.get_raw_value();
+    }
+
+    return result;
+  }
+
+
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t mask   = (1 << segbits) - 1;
+    camp::idx_t offset = input_segment << segbits;
+
+
+    camp::idx_t i = (get_lane() & mask) + offset;
+
+    result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
+
+
+    return result;
+  }
+
+
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
+
+    camp::idx_t i = (get_lane() >> segbits) + offset;
+
+    result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
+
+    return result;
+  }
+};
+
+
+}  // namespace expt
+
+}  // namespace RAJA
+
+
+#endif  // Guard
+
+#endif  // CUDA
diff --git a/include/RAJA/policy/tensor/arch/cuda/traits.hpp b/include/RAJA/policy/tensor/arch/cuda/traits.hpp
index 032517677c..8b9c355f44 100644
--- a/include/RAJA/policy/tensor/arch/cuda/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda/traits.hpp
@@ -21,26 +21,29 @@
 #ifndef RAJA_policy_tensor_arch_cuda_traits_HPP
 #define RAJA_policy_tensor_arch_cuda_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
-
-  template<typename T>
-  struct RegisterTraits<RAJA::expt::cuda_warp_register, T>{
-      using element_type = T;
-      using register_policy = RAJA::expt::cuda_warp_register;
-      static constexpr camp::idx_t s_num_elem = 32;
-      static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
-      using int_element_type = int32_t;
-  };
-
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
-
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
+
+template <typename T>
+struct RegisterTraits<RAJA::expt::cuda_warp_register, T>
+{
+  using element_type                      = T;
+  using register_policy                   = RAJA::expt::cuda_warp_register;
+  static constexpr camp::idx_t s_num_elem = 32;
+  static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
+  using int_element_type                  = int32_t;
+};
+
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_CUDA
diff --git a/include/RAJA/policy/tensor/arch/hip.hpp b/include/RAJA/policy/tensor/arch/hip.hpp
index 6e76772a29..3ddf27e39c 100644
--- a/include/RAJA/policy/tensor/arch/hip.hpp
+++ b/include/RAJA/policy/tensor/arch/hip.hpp
@@ -21,11 +21,11 @@
 #ifndef RAJA_policy_tensor_arch_hip_HPP
 #define RAJA_policy_tensor_arch_hip_HPP
 
-#include<RAJA/policy/tensor/arch/hip/traits.hpp>
-#include<RAJA/policy/tensor/arch/hip/hip_wave.hpp>
+#include <RAJA/policy/tensor/arch/hip/traits.hpp>
+#include <RAJA/policy/tensor/arch/hip/hip_wave.hpp>
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_CUDA
diff --git a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
index 74bbc2f077..6cf48ea358 100644
--- a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
+++ b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
@@ -30,983 +30,1021 @@
 #define RAJA_policy_tensor_arch_hip_hip_wave_register_HPP
 
 
-
 namespace RAJA
 {
 namespace expt
 {
 
 
-  template<typename ELEMENT_TYPE>
-  class Register<ELEMENT_TYPE, hip_wave_register> :
-    public internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>
+template <typename ELEMENT_TYPE>
+class Register<ELEMENT_TYPE, hip_wave_register>
+    : public internal::expt::RegisterBase<
+          Register<ELEMENT_TYPE, hip_wave_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>;
+
+  using register_policy = hip_wave_register;
+  using self_type       = Register<ELEMENT_TYPE, hip_wave_register>;
+  using element_type    = ELEMENT_TYPE;
+  using register_type   = ELEMENT_TYPE;
+
+  using int_vector_type = Register<int64_t, hip_wave_register>;
+
+
+private:
+  element_type m_value;
+
+public:
+  static constexpr int s_num_elem = 64;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register() : base_type(), m_value(0) {}
+
+
+  /*!
+   * @brief Copy constructor from raw value
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register(element_type c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& operator=(self_type const& c)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>;
-
-      using register_policy = hip_wave_register;
-      using self_type = Register<ELEMENT_TYPE, hip_wave_register>;
-      using element_type = ELEMENT_TYPE;
-      using register_type = ELEMENT_TYPE;
-
-      using int_vector_type = Register<int64_t, hip_wave_register>;
-
+    m_value = c.m_value;
+    return *this;
+  }
 
-		private:
-      element_type m_value;
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& operator=(element_type c)
+  {
+    m_value = c;
+    return *this;
+  }
+
+  /*!
+   * @brief Gets our warp lane
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  constexpr static int get_lane() { return threadIdx.x; }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  constexpr element_type const& get_raw_value() const { return m_value; }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  element_type& get_raw_value() { return m_value; }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  static constexpr bool is_root() { return get_lane() == 0; }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& load_packed(element_type const* ptr)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[lane];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& load_packed_n(element_type const* ptr, int N)
+  {
+    auto lane = get_lane();
+    if (lane < N)
+    {
+      m_value = ptr[lane];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& load_strided(element_type const* ptr, int stride)
+  {
 
-		public:
+    auto lane = get_lane();
 
-      static constexpr int s_num_elem = 64;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register() : base_type(), m_value(0) {
-
-      }
-
-
-      /*!
-       * @brief Copy constructor from raw value
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(element_type c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment operator
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(element_type c){
-        m_value = c;
-        return *this;
-      }
-
-      /*!
-       * @brief Gets our warp lane
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      static
-      int get_lane() {
-        return threadIdx.x;
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type const &get_raw_value() const {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      element_type &get_raw_value() {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return get_lane() == 0;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed(element_type const *ptr){
-
-        auto lane = get_lane();
-
-        m_value = ptr[lane];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed_n(element_type const *ptr, int N){
-        auto lane = get_lane();
-        if(lane < N){
-          m_value = ptr[lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided(element_type const *ptr, int stride){
-
-        auto lane = get_lane();
-
-        m_value = ptr[stride*lane];
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided_n(element_type const *ptr, int stride, int N){
-        auto lane = get_lane();
-
-        if(lane < N){
-          m_value = ptr[stride*lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get_raw_value()];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(get_lane() < N){
-          m_value = ptr[offsets.get_raw_value()];
-        }
-        else{
-          m_value = element_type(0);
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        m_value = ptr[seg*stride_outer + i*stride_inner];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          m_value = element_type(0);
-        }
-        else{
-          m_value = ptr[seg*stride_outer + i*stride_inner];
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed(element_type *ptr) const{
-
-        auto lane = get_lane();
-
-        ptr[lane] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed_n(element_type *ptr, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided(element_type *ptr, int stride) const{
-
-        auto lane = get_lane();
-
-        ptr[lane*stride] = m_value;
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided_n(element_type *ptr, int stride, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane*stride] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, T2 const &offsets) const {
-
-        ptr[offsets.get_raw_value()] = m_value;
-
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, T2 const &offsets, camp::idx_t N) const {
-        if(get_lane() < N){
-          ptr[offsets.get_raw_value()] = m_value;
-        }
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        ptr[seg*stride_outer + i*stride_inner] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays where we store partial segments.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          ptr[seg*stride_outer + i*stride_inner] = m_value;
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type get(int i) const
-			{
-        return hip::impl::shfl_sync(m_value, i);
-			}
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &set(element_type value, int i)
-			{
-				auto lane = get_lane();
-      	if(lane == i){
-					m_value = value;
-				}
-        return *this;
-			}
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.m_value = hip::impl::shfl_sync(m_value, i, 32);
-        return x;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, int N) const {
-        return get_lane() < N ? self_type(m_value / b.m_value) : self_type(element_type(0));
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMA
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+    m_value = ptr[stride * lane];
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& load_strided_n(element_type const* ptr, int stride, int N)
+  {
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      m_value = ptr[stride * lane];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
+
+    m_value = ptr[offsets.get_raw_value()];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type&
+  gather_n(element_type const* ptr, int_vector_type offsets, camp::idx_t N)
+  {
+    if (get_lane() < N)
+    {
+      m_value = ptr[offsets.get_raw_value()];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   *
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type& segmented_load(element_type const* ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    m_value = ptr[seg * stride_outer + i * stride_inner];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   *
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type& segmented_load_nm(element_type const* ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      m_value = element_type(0);
+    }
+    else
+    {
+      m_value = ptr[seg * stride_outer + i * stride_inner];
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const& store_packed(element_type* ptr) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N lanes of register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const& store_packed_n(element_type* ptr, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      ptr[lane] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const& store_strided(element_type* ptr, int stride) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane * stride] = m_value;
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type const& store_strided_n(element_type* ptr, int stride, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      ptr[lane * stride] = m_value;
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const& scatter(element_type* ptr,
+                                                   T2 const& offsets) const
+  {
+
+    ptr[offsets.get_raw_value()] = m_value;
+
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template <typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const&
+  scatter_n(element_type* ptr, T2 const& offsets, camp::idx_t N) const
+  {
+    if (get_lane() < N)
+    {
+      ptr[offsets.get_raw_value()] = m_value;
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays.
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type const& segmented_store(element_type* ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    ptr[seg * stride_outer + i * stride_inner] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays where we store partial segments.
+   *
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type const& segmented_store_nm(element_type* ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      // nop
+    }
+    else
+    {
+      ptr[seg * stride_outer + i * stride_inner] = m_value;
+    }
+
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  constexpr RAJA_INLINE RAJA_DEVICE element_type get(int i) const
+  {
+    return hip::impl::shfl_sync(m_value, i);
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type& set(element_type value, int i)
+  {
+    auto lane = get_lane();
+    if (lane == i)
+    {
+      m_value = value;
+    }
+    return *this;
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.m_value = hip::impl::shfl_sync(m_value, i, 32);
+    return x;
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+
+  RAJA_DEVICE
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, int N) const
+  {
+    return get_lane() < N ? self_type(m_value / b.m_value)
+                          : self_type(element_type(0));
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMA
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(fma(m_value, b.m_value, c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value + c.m_value);
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMS
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(m_value * b.m_value + c.m_value);
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMS
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, -c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(fma(m_value, b.m_value, -c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
+   */
+  template <typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value - c.m_value);
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type sum() const
-      {
-				// Allreduce sum
-				using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
-
-				return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max() const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max_n(int N) const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        auto ident = RAJA::operators::limits<element_type>::min();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmax(self_type a) const
-      {
-        return self_type{RAJA::max<element_type>(m_value, a.m_value)};
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min() const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min_n(int N) const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        auto ident = RAJA::operators::limits<element_type>::max();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmin(self_type a) const
-      {
-        return self_type{RAJA::min<element_type>(m_value, a.m_value)};
-      }
-
-
-
-
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
-      {
-        int_vector_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        result.get_raw_value() = seg*stride_outer + i*stride_inner;
-
-        return result;
-      }
-
-
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int delta = 1;delta < 1<<segbits;delta = delta<<1){
-
-          // tree shuffle
-          element_type y = hip::impl::shfl_sync(x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        result.get_raw_value() = hip::impl::shfl_sync(x, get_lane()<<segbits);
-
-        // Third: mask off everything but output_segment
-        //        this is because all output segments are valid at this point
-        // (5-segbits), the 5 is since the warp-width is 32 == 1<<5
-        int our_output_segment = get_lane()>>(6-segbits);
-        bool in_output_segment = our_output_segment == output_segment;
-        if(!in_output_segment){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      /*!
-       * Sum across segments, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums strided pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int i = 0;i < 6-segbits; ++ i){
-
-          // tree shuffle
-          int delta = s_num_elem >> (i+1);
-          element_type y = hip::impl::shfl_sync(x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        int get_from = get_lane()&( (1<<segbits)-1);
-        result.get_raw_value() = hip::impl::shfl_sync(x, get_from);
-
-        int mask = (get_lane()>>segbits) == output_segment;
-
-
-        // Third: mask off everything but output_segment
-        if(!mask){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        self_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          result.get_raw_value() = m_value / den.get_raw_value();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      output_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
-
-
-        camp::idx_t i = (get_lane()&mask) + offset;
-
-        result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
-
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
-
-        camp::idx_t i = (get_lane() >> segbits) + offset;
-
-        result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
-
-        return result;
-      }
-
-
-
-
-  };
-
-
-
-}   // namespace expt
-
-} // namespace RAJA
-
-
-#endif // Guard
-
-#endif // HIP
+                              RETURN_TYPE>::type
+      multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(m_value * b.m_value - c.m_value);
+  }
+
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type sum() const
+  {
+    // Allreduce sum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type max() const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the largest element from first N lanes
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type max_n(int N) const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    auto ident = RAJA::operators::limits<element_type>::min();
+    auto lane  = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type vmax(self_type a) const
+  {
+    return self_type {RAJA::max<element_type>(m_value, a.m_value)};
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type min() const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  element_type min_n(int N) const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    auto ident = RAJA::operators::limits<element_type>::max();
+    auto lane  = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type vmin(self_type a) const
+  {
+    return self_type {RAJA::min<element_type>(m_value, a.m_value)};
+  }
+
+
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
+
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    result.get_raw_value() = seg * stride_outer + i * stride_inner;
+
+    return result;
+  }
+
+
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values within each segment
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int delta = 1; delta < 1 << segbits; delta = delta << 1)
+    {
+
+      // tree shuffle
+      element_type y = hip::impl::shfl_sync(x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    result.get_raw_value() = hip::impl::shfl_sync(x, get_lane() << segbits);
+
+    // Third: mask off everything but output_segment
+    //        this is because all output segments are valid at this point
+    // (6-segbits), the 6 is since the register width is 64 == 1<<6
+    int our_output_segment = get_lane() >> (6 - segbits);
+    bool in_output_segment = our_output_segment == output_segment;
+    if (!in_output_segment)
+    {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  /*!
+   * Sum across segments, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums strided pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values within each segment
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int i = 0; i < 6 - segbits; ++i)
+    {
+
+      // tree shuffle
+      int delta      = s_num_elem >> (i + 1);
+      element_type y = hip::impl::shfl_sync(x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    int get_from           = get_lane() & ((1 << segbits) - 1);
+    result.get_raw_value() = hip::impl::shfl_sync(x, get_from);
+
+    int mask = (get_lane() >> segbits) == output_segment;
+
+
+    // Third: mask off everything but output_segment
+    if (!mask)
+    {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    self_type result;
+
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      // nop
+    }
+    else
+    {
+      result.get_raw_value() = m_value / den.get_raw_value();
+    }
+
+    return result;
+  }
+
+
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t mask   = (1 << segbits) - 1;
+    camp::idx_t offset = input_segment << segbits;
+
+
+    camp::idx_t i = (get_lane() & mask) + offset;
+
+    result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
+
+
+    return result;
+  }
+
+
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+  RAJA_DEVICE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
+
+    camp::idx_t i = (get_lane() >> segbits) + offset;
+
+    result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
+
+    return result;
+  }
+};
+
+
+}  // namespace expt
+
+}  // namespace RAJA
+
+
+#endif  // Guard
+
+#endif  // HIP
diff --git a/include/RAJA/policy/tensor/arch/hip/traits.hpp b/include/RAJA/policy/tensor/arch/hip/traits.hpp
index 4c4d959599..dc4d0d63d1 100644
--- a/include/RAJA/policy/tensor/arch/hip/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/hip/traits.hpp
@@ -21,26 +21,29 @@
 #ifndef RAJA_policy_tensor_arch_hip_traits_HPP
 #define RAJA_policy_tensor_arch_hip_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
-
-  template<typename T>
-  struct RegisterTraits<RAJA::expt::hip_wave_register, T>{
-      using element_type = T;
-      using register_policy = RAJA::expt::hip_wave_register;
-      static constexpr camp::idx_t s_num_elem = 64;
-      static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
-      using int_element_type = int32_t;
-  };
-
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
-
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
+
+template <typename T>
+struct RegisterTraits<RAJA::expt::hip_wave_register, T>
+{
+  using element_type                      = T;
+  using register_policy                   = RAJA::expt::hip_wave_register;
+  static constexpr camp::idx_t s_num_elem = 64;
+  static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
+  using int_element_type                  = int32_t;
+};
+
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_HIP
+#endif  // RAJA_ENABLE_HIP
diff --git a/include/RAJA/policy/tensor/arch/scalar.hpp b/include/RAJA/policy/tensor/arch/scalar.hpp
index 5e139f41f0..29b3788e80 100644
--- a/include/RAJA/policy/tensor/arch/scalar.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar.hpp
@@ -16,16 +16,12 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 
-
 #ifndef RAJA_policy_tensor_arch_scalar_HPP
 #define RAJA_policy_tensor_arch_scalar_HPP
 
 
-
-#include<RAJA/policy/tensor/arch/scalar/traits.hpp>
-#include<RAJA/policy/tensor/arch/scalar/scalar.hpp>
+#include <RAJA/policy/tensor/arch/scalar/traits.hpp>
+#include <RAJA/policy/tensor/arch/scalar/scalar.hpp>
 
 
 #endif
-
-
diff --git a/include/RAJA/policy/tensor/arch/scalar/scalar.hpp b/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
index 139c5d27a5..d63b78c9f4 100644
--- a/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
@@ -22,449 +22,464 @@
 
 namespace RAJA
 {
-namespace expt {
+namespace expt
+{
+
+/**
+ * A specialization for a single element register.
+ * We will implement this as a scalar value, and let the compiler use
+ * whatever registers it deems appropriate.
+ */
+template <typename T>
+class Register<T, scalar_register>
+    : public internal::expt::RegisterBase<Register<T, scalar_register>>
+{
+public:
+  using base_type = internal::expt::RegisterBase<Register<T, scalar_register>>;
+
+  using register_policy = scalar_register;
+  using self_type       = Register<T, scalar_register>;
+  using element_type    = T;
+  using register_type   = T;
+
+  using int_vector_type =
+      Register<typename internal::expt::RegisterTraits<scalar_register,
+                                                       T>::int_element_type,
+               scalar_register>;
+
+
+private:
+  T m_value;
+
+public:
+  static constexpr camp::idx_t s_num_elem = 1;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr Register() : base_type(), m_value(0) {}
+
+  /*!
+   * @brief Construct from a single element value
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr Register(element_type const& c) : base_type(), m_value(c) {}
+
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = ptr[0];
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    if (N > 0)
+    {
+      m_value = ptr[0];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t)
+  {
+    m_value = ptr[0];
+    return *this;
+  }
+
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr, camp::idx_t, camp::idx_t N)
+  {
+    if (N > 0)
+    {
+      m_value = ptr[0];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
+
+    m_value = ptr[offsets.get(0)];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type&
+  gather_n(element_type const* ptr, int_vector_type offsets, camp::idx_t N)
+  {
+    if (N > 0)
+    {
+      m_value = ptr[offsets.get(0)];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    ptr[0] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    if (N > 0)
+    {
+      ptr[0] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t) const
+  {
+    ptr[0] = m_value;
+    return *this;
+  }
+
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type const&
+  store_strided_n(element_type* ptr, camp::idx_t, camp::idx_t N) const
+  {
+    if (N > 0)
+    {
+      ptr[0] = m_value;
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type const& scatter(element_type* ptr, int_vector_type offsets) const
+  {
+
+    ptr[offsets.get(0)] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type const&
+  scatter_n(element_type* ptr, int_vector_type offsets, camp::idx_t N) const
+  {
+    if (N > 0)
+    {
+      ptr[offsets.get(0)] = m_value;
+    }
+    return *this;
+  }
+
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE element_type get(camp::idx_t) const
+  {
+    return m_value;
+  }
+
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type& set(element_type value, camp::idx_t)
+  {
+    m_value = value;
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& broadcast(element_type const& a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+  /*!
+   * @brief Fused multiply add: fma(b, c) = (*this)*b+c
+   *
+   * Derived types can override this to implement intrinsic FMA's
+   *
+   * @param b Second product operand
+   * @param c Sum operand
+   * @return Value of (*this)*b+c
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return m_value * b.m_value + c.m_value;
+  }
+
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
+   *
+   * Derived types can override this to implement intrinsic FMS's
+   *
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
+   */
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return m_value * b.m_value - c.m_value;
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr element_type sum() const { return m_value; }
+
+
+  /*!
+   * @brief Dot product of this vector with another vector
+   * @return Sum of the element-wise products of the two vectors
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr element_type dot(self_type const& b) const
+  {
+    return m_value * b.m_value;
+  }
+
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr element_type max() const { return m_value; }
+
+  /*!
+   * @brief Returns the largest element from first N lanes
+   * @return The largest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return N ? m_value : RAJA::operators::limits<element_type>::min();
+    ;
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(RAJA::max<element_type>(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type min() const { return m_value; }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return N ? m_value : RAJA::operators::limits<element_type>::max();
+    ;
+  }
 
-  /**
-   * A specialization for a single element register.
-   * We will implement this as a scalar value, and let the compiler use
-   * whatever registers it deems appropriate.
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
    */
-  template<typename T>
-  class Register<T, scalar_register> :
-      public internal::expt::RegisterBase<Register<T, scalar_register>>
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  self_type vmin(self_type a) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<T, scalar_register>>;
-
-      using register_policy = scalar_register;
-      using self_type = Register<T, scalar_register>;
-      using element_type = T;
-      using register_type = T;
-
-      using int_vector_type = Register<typename internal::expt::RegisterTraits<scalar_register, T>::int_element_type, scalar_register>;
-
-
-    private:
-      T m_value;
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 1;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register() : base_type(), m_value(0) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register(element_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = ptr[0];
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[0];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t ){
-        m_value = ptr[0];
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t , camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[0];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get(0)];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[offsets.get(0)];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        ptr[0] = m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        if(N > 0){
-          ptr[0] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t ) const{
-        ptr[0] = m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t , camp::idx_t N) const{
-        if(N > 0){
-          ptr[0] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, int_vector_type offsets) const {
-
-        ptr[offsets.get(0)] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, int_vector_type offsets, camp::idx_t N) const {
-        if(N > 0){
-          ptr[offsets.get(0)] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type get(camp::idx_t) const
-      {return m_value;}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &set(element_type value, camp::idx_t)
-      {
-        m_value = value;
-        return *this;
-      }
-
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-      /*!
-       * @brief Fused multiply add: fma(b, c) = (*this)*b+c
-       *
-       * Derived types can override this to implement intrinsic FMA's
-       *
-       * @param b Second product operand
-       * @param c Sum operand
-       * @return Value of (*this)*b+c
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return m_value * b.m_value + c.m_value;
-      }
-
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return m_value * b.m_value - c.m_value;
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type sum() const
-      {
-        return m_value;
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type dot(self_type const &b) const
-      {
-        return m_value * b.m_value;
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type max() const
-      {
-        return m_value;
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        return N ? m_value : RAJA::operators::limits<element_type>::min();;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(RAJA::max<element_type>(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the smallest element
-       * @return The smallest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type min() const
-      {
-        return m_value;
-      }
-
-      /*!
-       * @brief Returns the smallest element from first N lanes
-       * @return The smallest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        return N ? m_value : RAJA::operators::limits<element_type>::max();;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(RAJA::min<element_type>(m_value, a.m_value));
-      }
-
-
-
-  };
-} // namespace expt
+    return self_type(RAJA::min<element_type>(m_value, a.m_value));
+  }
+};
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/policy/tensor/arch/scalar/traits.hpp b/include/RAJA/policy/tensor/arch/scalar/traits.hpp
index dfeccbb86f..92496eeae3 100644
--- a/include/RAJA/policy/tensor/arch/scalar/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar/traits.hpp
@@ -19,52 +19,57 @@
 #ifndef RAJA_policy_tensor_arch_scalar_traits_HPP
 #define RAJA_policy_tensor_arch_scalar_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::scalar_register, int32_t>
+{
+  using element_type                      = int32_t;
+  using register_policy                   = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::scalar_register, int64_t>
+{
+  using element_type                      = int64_t;
+  using register_policy                   = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type                  = int64_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int32_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::scalar_register, float>
+{
+  using element_type                      = float;
+  using register_policy                   = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int64_t;
-  };
+template <>
+struct RegisterTraits<RAJA::expt::scalar_register, double>
+{
+  using element_type                      = double;
+  using register_policy                   = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type                  = int64_t;
+};
 
 
-}
-}
-}
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 #endif
-
-
diff --git a/include/RAJA/policy/tensor/arch_impl.hpp b/include/RAJA/policy/tensor/arch_impl.hpp
index e14451505a..0e7085b5e2 100644
--- a/include/RAJA/policy/tensor/arch_impl.hpp
+++ b/include/RAJA/policy/tensor/arch_impl.hpp
@@ -22,7 +22,6 @@
 #include "RAJA/policy/tensor/arch.hpp"
 
 
-
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -32,30 +31,29 @@
 //
 
 #ifdef __AVX512F__
-#include<RAJA/policy/tensor/arch/avx512.hpp>
+#include <RAJA/policy/tensor/arch/avx512.hpp>
 #endif
 
 
 #ifdef __AVX2__
-#include<RAJA/policy/tensor/arch/avx2.hpp>
+#include <RAJA/policy/tensor/arch/avx2.hpp>
 #endif
 
 
 #ifdef __AVX__
-#include<RAJA/policy/tensor/arch/avx.hpp>
+#include <RAJA/policy/tensor/arch/avx.hpp>
 #endif
 
 #ifdef RAJA_CUDA_ACTIVE
-#include<RAJA/policy/tensor/arch/cuda.hpp>
+#include <RAJA/policy/tensor/arch/cuda.hpp>
 #endif
 
 #ifdef RAJA_HIP_ACTIVE
-#include<RAJA/policy/tensor/arch/hip.hpp>
+#include <RAJA/policy/tensor/arch/hip.hpp>
 #endif
 
 // The scalar register is always supported (doesn't require any SIMD/SIMT)
-#include<RAJA/policy/tensor/arch/scalar.hpp>
-
+#include <RAJA/policy/tensor/arch/scalar.hpp>
 
 
 #endif
diff --git a/include/RAJA/policy/tensor/policy.hpp b/include/RAJA/policy/tensor/policy.hpp
index 8618d543b2..0b71c1143b 100644
--- a/include/RAJA/policy/tensor/policy.hpp
+++ b/include/RAJA/policy/tensor/policy.hpp
@@ -40,37 +40,42 @@ namespace policy
 namespace tensor
 {
 
-template<typename EXEC_POLICY, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t TILE_SIZE>
-struct tensor_exec : public EXEC_POLICY {
+template <typename EXEC_POLICY,
+          typename TENSOR_TYPE,
+          camp::idx_t DIM,
+          camp::idx_t TILE_SIZE>
+struct tensor_exec : public EXEC_POLICY
+{
   using exec_policy = EXEC_POLICY;
   using tensor_type = TENSOR_TYPE;
 
   static constexpr camp::idx_t s_tensor_dim = DIM;
-  static constexpr camp::idx_t s_tile_size = TILE_SIZE;
+  static constexpr camp::idx_t s_tile_size  = TILE_SIZE;
 };
 
 
-
 }  // end of namespace tensor
 
 }  // end of namespace policy
 
-namespace expt {
-
-
-template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using vector_exec = policy::tensor::tensor_exec<RAJA::seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
+namespace expt
+{
 
-template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using matrix_row_exec = policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
 
-template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using matrix_col_exec = policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 1, TILE_SIZE>;
+template <typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
+using vector_exec =
+    policy::tensor::tensor_exec<RAJA::seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
 
+template <typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
+using matrix_row_exec =
+    policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
 
-} //  namespace expt
+template <typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
+using matrix_col_exec =
+    policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 1, TILE_SIZE>;
 
 
+}  //  namespace expt
 
 
 }  // end of namespace RAJA
diff --git a/include/RAJA/util/BitMask.hpp b/include/RAJA/util/BitMask.hpp
index 63f011b689..19f1a339ee 100644
--- a/include/RAJA/util/BitMask.hpp
+++ b/include/RAJA/util/BitMask.hpp
@@ -24,61 +24,62 @@
 namespace RAJA
 {
 
-  template<camp::idx_t N>
-  struct LogBase2
-  {
-      static constexpr camp::idx_t value = LogBase2<(N>>1)>::value + 1;
-      static constexpr bool is_exact = ((1<<value) == N);
-  };
-
-  template<>
-  struct LogBase2<0>
-  {
-      static constexpr camp::idx_t value = -1;
-      static constexpr bool is_exact = true;
-  };
+template <camp::idx_t N>
+struct LogBase2
+{
+  static constexpr camp::idx_t value = LogBase2<(N >> 1)>::value + 1;
+  static constexpr bool is_exact     = ((1 << value) == N);
+};
 
-  /*!
-   * A bit-masking operator
-   *
-   * Provides an operator that shifts and masks in input value to extract
-   * a contiguous set of bits.
-   *
-   * result = (input >> Shift) & (Mask)
-   *
-   * Where mask is (1<<Width)-1, or the number of bits defined by Width.
-   *
-   *
-   */
-  template<int Width, int Shift>
-  struct BitMask {
-    static constexpr int shift = Shift;
-    static constexpr int width = Width;
-    static constexpr int max_input_size = 1<<(Shift+Width);
-    static constexpr int max_masked_size = 1<<Width;
-    static constexpr int max_shifted_size = 1<<Shift;
+template <>
+struct LogBase2<0>
+{
+  static constexpr camp::idx_t value = -1;
+  static constexpr bool is_exact     = true;
+};
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T maskValue(T input) {
-      return( (input>>( static_cast<T>(Shift) )) & static_cast<T>((1<<(Width))-1) );
-    }
+/*!
+ * A bit-masking operator
+ *
+ * Provides an operator that shifts and masks an input value to extract
+ * a contiguous set of bits.
+ *
+ * result = (input >> Shift) & (Mask)
+ *
+ * Where mask is (1<<Width)-1, or the number of bits defined by Width.
+ *
+ *
+ */
+template <int Width, int Shift>
+struct BitMask
+{
+  static constexpr int shift            = Shift;
+  static constexpr int width            = Width;
+  static constexpr int max_input_size   = 1 << (Shift + Width);
+  static constexpr int max_masked_size  = 1 << Width;
+  static constexpr int max_shifted_size = 1 << Shift;
 
+  template <typename T>
+  RAJA_HOST_DEVICE static constexpr T maskValue(T input)
+  {
+    return ((input >> (static_cast<T>(Shift))) &
+            static_cast<T>((1 << (Width)) - 1));
+  }
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T getOuter(T input) {
-      return(  (input>>(static_cast<T>(Shift))) >> Width );
-    }
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T maskOuter(T input) {
-      return( input & (static_cast<T>(-1) << (Width+Shift) )  );
-    }
+  template <typename T>
+  RAJA_HOST_DEVICE static constexpr T getOuter(T input)
+  {
+    return ((input >> (static_cast<T>(Shift))) >> Width);
+  }
 
-  };
+  template <typename T>
+  RAJA_HOST_DEVICE static constexpr T maskOuter(T input)
+  {
+    return (input & (static_cast<T>(-1) << (Width + Shift)));
+  }
+};
 
 }  // namespace RAJA
 
-#endif //RAJA_util_BitMask_HPP
+#endif  // RAJA_util_BitMask_HPP
diff --git a/include/RAJA/util/CombiningAdapter.hpp b/include/RAJA/util/CombiningAdapter.hpp
index abe8197b93..b6f1f05dc4 100644
--- a/include/RAJA/util/CombiningAdapter.hpp
+++ b/include/RAJA/util/CombiningAdapter.hpp
@@ -82,11 +82,11 @@ struct CombiningAdapter
 {
   using Layout = Layout_;
 
-  using IndexRange = typename Layout::IndexRange;
+  using IndexRange     = typename Layout::IndexRange;
   using StrippedIdxLin = typename Layout::StrippedIdxLin;
-  using IndexLinear = typename Layout::IndexLinear;
-  using DimTuple = typename Layout::DimTuple;
-  using DimArr = typename Layout::DimArr;
+  using IndexLinear    = typename Layout::IndexLinear;
+  using DimTuple       = typename Layout::DimTuple;
+  using DimArr         = typename Layout::DimArr;
 
   using RangeLinear = RAJA::TypedRangeSegment<IndexLinear>;
 
@@ -95,10 +95,11 @@ struct CombiningAdapter
   Layout m_layout;
 
   RAJA_SUPPRESS_HD_WARN
-  template < camp::idx_t... RangeInts >
+  template <camp::idx_t... RangeInts>
   RAJA_HOST_DEVICE inline auto call_helper(IndexLinear linear_index,
                                            camp::idx_seq<RangeInts...>)
-    -> decltype(m_lambda(camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
+      -> decltype(m_lambda(
+          camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
   {
     DimTuple indices;
     m_layout.toIndices(linear_index, camp::get<RangeInts>(indices)...);
@@ -106,10 +107,11 @@ struct CombiningAdapter
   }
   ///
   RAJA_SUPPRESS_HD_WARN
-  template < camp::idx_t... RangeInts >
+  template <camp::idx_t... RangeInts>
   RAJA_HOST_DEVICE inline auto call_helper(IndexLinear linear_index,
                                            camp::idx_seq<RangeInts...>) const
-    -> decltype(m_lambda(camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
+      -> decltype(m_lambda(
+          camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
   {
     DimTuple indices;
     m_layout.toIndices(linear_index, camp::get<RangeInts>(indices)...);
@@ -117,16 +119,14 @@ struct CombiningAdapter
   }
 
 public:
-
   /*!
    * Constructor from lambda and layout.
    */
-  template < typename C_Lambda, typename C_Layout >
+  template <typename C_Lambda, typename C_Layout>
   RAJA_HOST_DEVICE CombiningAdapter(C_Lambda&& lambda, C_Layout&& layout)
-      : m_lambda(std::forward<C_Lambda>(lambda))
-      , m_layout(std::forward<C_Layout>(layout))
-  {
-  }
+      : m_lambda(std::forward<C_Lambda>(lambda)),
+        m_layout(std::forward<C_Layout>(layout))
+  {}
 
   /*!
    * Call the lambda by converting the linear index to multidimensional indices.
@@ -134,13 +134,13 @@ struct CombiningAdapter
    * @return return value of lambda
    */
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(IndexLinear linear_index)
-    -> decltype(call_helper(linear_index, IndexRange()))
+      -> decltype(call_helper(linear_index, IndexRange()))
   {
     return call_helper(linear_index, IndexRange());
   }
   ///
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(IndexLinear linear_index) const
-    -> decltype(call_helper(linear_index, IndexRange()))
+      -> decltype(call_helper(linear_index, IndexRange()))
   {
     return call_helper(linear_index, IndexRange());
   }
@@ -207,9 +207,9 @@ struct CombiningAdapter
  *
  */
 template <typename Lambda, typename Layout>
-RAJA_HOST_DEVICE RAJA_INLINE
-auto make_CombiningAdapter_from_layout(Lambda&& lambda, Layout&& layout)
-  // -> CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>
+RAJA_HOST_DEVICE RAJA_INLINE auto
+make_CombiningAdapter_from_layout(Lambda&& lambda, Layout&& layout)
+// -> CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>
 {
   return CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>(
       std::forward<Lambda>(lambda), std::forward<Layout>(layout));
@@ -217,48 +217,54 @@ auto make_CombiningAdapter_from_layout(Lambda&& lambda, Layout&& layout)
 ///
 RAJA_SUPPRESS_HD_WARN
 template <typename Lambda, typename... IdxTs>
-RAJA_INLINE
-auto make_CombiningAdapter(Lambda&& lambda, ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
-  // -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
-  //             camp::val<RAJA::TypedOffsetLayout<
-  //                 typename std::common_type< strip_index_type_t<IdxTs>... >::type,
-  //                 IdxTs...>>()))
+RAJA_INLINE auto
+make_CombiningAdapter(Lambda&& lambda,
+                      ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
+// -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
+//             camp::val<RAJA::TypedOffsetLayout<
+//                 typename std::common_type< strip_index_type_t<IdxTs>...
+//                 >::type, IdxTs...>>()))
 {
-  using std::begin; using std::end; using std::distance;
-  using IdxLin = typename std::common_type< strip_index_type_t<IdxTs>... >::type;
+  using std::begin;
+  using std::distance;
+  using std::end;
+  using IdxLin = typename std::common_type<strip_index_type_t<IdxTs>...>::type;
   using Layout = RAJA::Layout<sizeof...(IdxTs), IdxLin>;
   using OffsetLayout = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<IdxTs...>>;
 
   Layout layout(static_cast<IdxLin>(distance(begin(segs), end(segs)))...);
   OffsetLayout offset_layout = OffsetLayout::from_layout_and_offsets(
-        {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
-                                            : static_cast<IdxLin>(0))...}},
-        std::move(layout));
+      {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
+                                          : static_cast<IdxLin>(0))...}},
+      std::move(layout));
   return make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
                                            std::move(offset_layout));
 }
 ///
 RAJA_SUPPRESS_HD_WARN
 template <typename Perm, typename Lambda, typename... IdxTs>
-RAJA_INLINE
-auto make_PermutedCombiningAdapter(Lambda&& lambda, ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
-  // -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
-  //             camp::val<RAJA::TypedOffsetLayout<
-  //                 typename std::common_type< strip_index_type_t<IdxTs>... >::type,
-  //                 IdxTs...>>()))
+RAJA_INLINE auto
+make_PermutedCombiningAdapter(Lambda&& lambda,
+                              ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
+// -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
+//             camp::val<RAJA::TypedOffsetLayout<
+//                 typename std::common_type< strip_index_type_t<IdxTs>...
+//                 >::type, IdxTs...>>()))
 {
-  using std::begin; using std::end; using std::distance;
-  using IdxLin = typename std::common_type< strip_index_type_t<IdxTs>... >::type;
+  using std::begin;
+  using std::distance;
+  using std::end;
+  using IdxLin = typename std::common_type<strip_index_type_t<IdxTs>...>::type;
   using OffsetLayout = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<IdxTs...>>;
 
   auto layout = make_permuted_layout<sizeof...(IdxTs), IdxLin>(
-              {{static_cast<IdxLin>(distance(begin(segs), end(segs)))...}},
-              RAJA::as_array<Perm>::get());
+      {{static_cast<IdxLin>(distance(begin(segs), end(segs)))...}},
+      RAJA::as_array<Perm>::get());
   OffsetLayout offset_layout = OffsetLayout::from_layout_and_offsets(
-        {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
-                                            : static_cast<IdxLin>(0))...}},
+      {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
+                                          : static_cast<IdxLin>(0))...}},
 
-        std::move(layout));
+      std::move(layout));
   return make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
                                            std::move(offset_layout));
 }
diff --git a/include/RAJA/util/EnableIf.hpp b/include/RAJA/util/EnableIf.hpp
index 257e852bf9..db0928385e 100644
--- a/include/RAJA/util/EnableIf.hpp
+++ b/include/RAJA/util/EnableIf.hpp
@@ -41,14 +41,16 @@ struct is_any_of;
 
 template <typename T, typename... Types>
 struct is_any_of<T, ::camp::list<Types...>>
-  : ::RAJA::concepts::any_of<::camp::is_same<T, Types>...>
+    : ::RAJA::concepts::any_of<::camp::is_same<T, Types>...>
 {};
 
 template <typename T, typename TypeList>
 using enable_if_is_any_of = std::enable_if_t<is_any_of<T, TypeList>::value, T>;
 
 template <typename T, typename TypeList>
-using enable_if_is_none_of = std::enable_if_t<::RAJA::concepts::negate<is_any_of<T, TypeList>>::value, T>;
+using enable_if_is_none_of =
+    std::enable_if_t<::RAJA::concepts::negate<is_any_of<T, TypeList>>::value,
+                     T>;
 
 
 }  // namespace util
diff --git a/include/RAJA/util/IndexLayout.hpp b/include/RAJA/util/IndexLayout.hpp
index 6bb308d375..005f26b337 100644
--- a/include/RAJA/util/IndexLayout.hpp
+++ b/include/RAJA/util/IndexLayout.hpp
@@ -3,7 +3,8 @@
  *
  * \file
  *
- * \brief   RAJA header file defining the IndexLayout class and IndexList classes.
+ * \brief   RAJA header file defining the IndexLayout class and IndexList
+ *          classes.
  *
  ******************************************************************************
  */
@@ -20,73 +21,83 @@
 
 #include "RAJA/util/Layout.hpp"
 
-namespace RAJA 
+namespace RAJA
 {
 
 /*!
-* DirectIndex struct contains call operator that returns the same index that was input
-*
-*/
-template<typename IdxLin = Index_type>
-struct DirectIndex {
+ * DirectIndex struct contains call operator that returns the same index that
+ * was input
+ *
+ */
+template <typename IdxLin = Index_type>
+struct DirectIndex
+{
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr
+  operator()(const IdxLin idx) const
   {
     return idx;
   }
-
 };
 
 /*!
-* IndexList struct stores a pointer to an array containing the index list.
-* Its call operator returns the entry at the input location (idx) of its index list.
-* 
-*/
-template<typename IdxLin = Index_type>
-struct IndexList {
+ * IndexList struct stores a pointer to an array containing the index list.
+ * Its call operator returns the entry at the input location (idx) of its index
+ * list.
+ *
+ */
+template <typename IdxLin = Index_type>
+struct IndexList
+{
 
-  IdxLin* index_list{nullptr};
+  IdxLin* index_list {nullptr};
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr
+  operator()(const IdxLin idx) const
   {
     return index_list[idx];
   }
-
 };
 
 /*!
-* ConditionalIndexList struct stores a pointer to an array containing the index list.
-* Its call operator returns the same index that was input if the index list is a nullptr, 
-* or otherwise returns the entry at the input location (idx) of its index list.
-* 
-*/
-template<typename IdxLin = Index_type>
-struct ConditionalIndexList {
+ * ConditionalIndexList struct stores a pointer to an array containing the index
+ * list. Its call operator returns the same index that was input if the index
+ * list is a nullptr, or otherwise returns the entry at the input location (idx)
+ * of its index list.
+ *
+ */
+template <typename IdxLin = Index_type>
+struct ConditionalIndexList
+{
 
-  IdxLin* index_list{nullptr};  
+  IdxLin* index_list {nullptr};
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr
+  operator()(const IdxLin idx) const
   {
-    if (index_list) {
+    if (index_list)
+    {
       return index_list[idx];
-    } else {
+    }
+    else
+    {
       return idx;
     }
   }
-
 };
 
 namespace internal
 {
 
-template<typename Range, typename IdxLin, typename... IndexTypes>
+template <typename Range, typename IdxLin, typename... IndexTypes>
 struct IndexLayout_impl;
 
 template <camp::idx_t... RangeInts, typename IdxLin, typename... IndexTypes>
-struct IndexLayout_impl<camp::idx_seq<RangeInts...>, IdxLin, IndexTypes...> {
-  using IndexRange = camp::idx_seq<RangeInts...>;
+struct IndexLayout_impl<camp::idx_seq<RangeInts...>, IdxLin, IndexTypes...>
+{
+  using IndexRange  = camp::idx_seq<RangeInts...>;
   using IndexLinear = IdxLin;
-  using Base = RAJA::detail::LayoutBase_impl<IndexRange, IdxLin>;
+  using Base        = RAJA::detail::LayoutBase_impl<IndexRange, IdxLin>;
   Base base_;
 
   static constexpr size_t n_dims = sizeof...(RangeInts);
@@ -94,76 +105,78 @@ struct IndexLayout_impl<camp::idx_seq<RangeInts...>, IdxLin, IndexTypes...> {
   camp::tuple<IndexTypes...> tuple;
 
   template <typename... Types>
-  constexpr RAJA_INLINE IndexLayout_impl(
-      camp::tuple<IndexTypes...> index_tuple_in,
-      Types... ns)
-      : base_{(ns)...},
-        tuple(index_tuple_in)
-  {
-  }
+  constexpr RAJA_INLINE
+  IndexLayout_impl(camp::tuple<IndexTypes...> index_tuple_in, Types... ns)
+      : base_ {(ns)...}, tuple(index_tuple_in)
+  {}
 
   /*!
    * Computes a linear space index from entries of index lists stored in tuple.
-   * This is accomplished through the inner product of the strides and the 
+   * This is accomplished through the inner product of the strides and the
    * entry in the index list along each dimension.
    * @param indices Indices in the n-dimensional space of this layout
    * @return Linear space index.
-   */  
+   */
   template <typename... Indices>
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
-      Indices... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin
+  operator()(Indices... indices) const
   {
     return sum<IdxLin>(
-      (base_.strides[RangeInts] * camp::get<RangeInts>(tuple)(indices))...);
+        (base_.strides[RangeInts] * camp::get<RangeInts>(tuple)(indices))...);
   }
-
 };
 
-} // namespace internal
+}  // namespace internal
 
 
-template <size_t n_dims = 1, typename IdxLin = Index_type, typename... IndexTypes>
+template <size_t n_dims   = 1,
+          typename IdxLin = Index_type,
+          typename... IndexTypes>
 struct IndexLayout
-    : public internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...> {
-  using Base =
-      internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>;
+    : public internal::
+          IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>
+{
+  using Base = internal::
+      IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>;
 
   using internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>,
-                                    IdxLin, IndexTypes...>::IndexLayout_impl;
-
-  constexpr RAJA_INLINE RAJA_HOST_DEVICE IndexLayout(
-      const internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>&
-          rhs)
-      : Base{rhs}
-  {
-  }
-
+                                   IdxLin,
+                                   IndexTypes...>::IndexLayout_impl;
+
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE
+  IndexLayout(const internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>,
+                                               IdxLin,
+                                               IndexTypes...>& rhs)
+      : Base {rhs}
+  {}
 };
 
 /*!
- * creates of a camp::tuple of index types 
+ * creates a camp::tuple of index types
  * (such as DirectIndex, IndexList, or ConditionalIndexList)
  *
  */
 template <typename... IndexTypes>
 auto make_index_tuple(IndexTypes... it) -> camp::tuple<IndexTypes...>
 {
-    return camp::tuple<IndexTypes...>(it...);
+  return camp::tuple<IndexTypes...>(it...);
 }
 
 /*!
  * creates an index layout based on the input camp::tuple of index types
  *
- */  
-template <typename IdxLin = Index_type, typename... Types, typename... IndexTypes>
-auto make_index_layout(
-  camp::tuple<IndexTypes...> index_tuple_in,
-  Types... ns) -> IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>
+ */
+template <typename IdxLin = Index_type,
+          typename... Types,
+          typename... IndexTypes>
+auto make_index_layout(camp::tuple<IndexTypes...> index_tuple_in, Types... ns)
+    -> IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>
 {
-    static_assert(sizeof...(Types) == sizeof...(IndexTypes), "");
-    return IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>(index_tuple_in, ns...);
+  static_assert(sizeof...(Types) == sizeof...(IndexTypes), "");
+  return IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>(index_tuple_in,
+                                                              ns...);
 }
 
-}
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/KokkosPluginLoader.hpp b/include/RAJA/util/KokkosPluginLoader.hpp
index c5060a0a96..7812306b71 100644
--- a/include/RAJA/util/KokkosPluginLoader.hpp
+++ b/include/RAJA/util/KokkosPluginLoader.hpp
@@ -14,39 +14,44 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/PluginStrategy.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
-  class KokkosPluginLoader : public ::RAJA::util::PluginStrategy
-  {
-  public:
-    using Parent = ::RAJA::util::PluginStrategy;
-    typedef void (*init_function)(const int, const uint64_t, const uint32_t, void*);
-    typedef void (*pre_function)(const char*, const uint32_t, uint64_t*);
-    typedef void (*post_function)(uint64_t);
-    typedef void (*finalize_function)();
+class KokkosPluginLoader : public ::RAJA::util::PluginStrategy
+{
+public:
+  using Parent = ::RAJA::util::PluginStrategy;
+  typedef void (*init_function)(const int,
+                                const uint64_t,
+                                const uint32_t,
+                                void*);
+  typedef void (*pre_function)(const char*, const uint32_t, uint64_t*);
+  typedef void (*post_function)(uint64_t);
+  typedef void (*finalize_function)();
 
-    KokkosPluginLoader();
+  KokkosPluginLoader();
 
-    void preLaunch(const RAJA::util::PluginContext& p) override;
+  void preLaunch(const RAJA::util::PluginContext& p) override;
 
-    void postLaunch(const RAJA::util::PluginContext& p) override;
+  void postLaunch(const RAJA::util::PluginContext& p) override;
 
-    void finalize() override;
+  void finalize() override;
 
-  private:
-    void initPlugin(const std::string &path);
-    
-    void initDirectory(const std::string &path);
+private:
+  void initPlugin(const std::string& path);
 
-    std::vector<init_function> init_functions;
-    std::vector<pre_function> pre_functions;
-    std::vector<post_function> post_functions;
-    std::vector<finalize_function> finalize_functions;
+  void initDirectory(const std::string& path);
 
-  };  // end KokkosPluginLoader class
+  std::vector<init_function> init_functions;
+  std::vector<pre_function> pre_functions;
+  std::vector<post_function> post_functions;
+  std::vector<finalize_function> finalize_functions;
 
-  void linkKokkosPluginLoader();
+};  // end KokkosPluginLoader class
+
+void linkKokkosPluginLoader();
 
 }  // end namespace util
 }  // end namespace RAJA
diff --git a/include/RAJA/util/Layout.hpp b/include/RAJA/util/Layout.hpp
index 948e37f498..30a044e322 100644
--- a/include/RAJA/util/Layout.hpp
+++ b/include/RAJA/util/Layout.hpp
@@ -38,9 +38,8 @@ namespace detail
 {
 
 
-
 template <typename Range,
-          typename IdxLin = Index_type,
+          typename IdxLin        = Index_type,
           ptrdiff_t StrideOneDim = -1>
 struct LayoutBase_impl;
 
@@ -49,63 +48,62 @@ struct LayoutBase_impl;
  */
 
 template <size_t j, size_t n_dims, typename IdxLin = Index_type>
-struct stride_calculator {
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
-      IdxLin cur_stride,
-      IdxLin const (&sizes)[n_dims]) const
+struct stride_calculator
+{
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin
+  operator()(IdxLin cur_stride, IdxLin const (&sizes)[n_dims]) const
   {
-    return stride_calculator<j + 1, n_dims, IdxLin>{}(
+    return stride_calculator<j + 1, n_dims, IdxLin> {}(
         cur_stride * (sizes[j] ? sizes[j] : 1), sizes);
   }
 };
 template <size_t n_dims, typename IdxLin>
-struct stride_calculator<n_dims, n_dims, IdxLin> {
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
-      IdxLin cur_stride,
-      IdxLin const (&)[n_dims]) const
+struct stride_calculator<n_dims, n_dims, IdxLin>
+{
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin
+  operator()(IdxLin cur_stride, IdxLin const (&)[n_dims]) const
   {
     return cur_stride;
   }
 };
 
 template <camp::idx_t... RangeInts, typename IdxLin, ptrdiff_t StrideOneDim>
-struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
+struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim>
+{
 public:
   using IndexLinear = IdxLin;
-  using IndexRange = camp::make_idx_seq_t<sizeof...(RangeInts)>;
+  using IndexRange  = camp::make_idx_seq_t<sizeof...(RangeInts)>;
 
   static constexpr size_t n_dims = sizeof...(RangeInts);
-  static constexpr IdxLin limit = RAJA::operators::limits<IdxLin>::max();
+  static constexpr IdxLin limit  = RAJA::operators::limits<IdxLin>::max();
   static constexpr ptrdiff_t stride_one_dim = StrideOneDim;
 
-  IdxLin sizes[n_dims] = {0};
-  IdxLin strides[n_dims] = {0};
+  IdxLin sizes[n_dims]       = {0};
+  IdxLin strides[n_dims]     = {0};
   IdxLin inv_strides[n_dims] = {0};
-  IdxLin inv_mods[n_dims] = {0};
+  IdxLin inv_mods[n_dims]    = {0};
 
 
   /*!
    * Default constructor with zero sizes and strides.
    */
-  constexpr RAJA_INLINE LayoutBase_impl() = default;
-  constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl const &) = default;
-  constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl &&) = default;
-  RAJA_INLINE LayoutBase_impl &operator=(LayoutBase_impl const &) =
-      default;
-  RAJA_INLINE LayoutBase_impl &operator=(LayoutBase_impl &&) =
-      default;
+  constexpr RAJA_INLINE LayoutBase_impl()                        = default;
+  constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl const&)  = default;
+  constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl&&)       = default;
+  RAJA_INLINE LayoutBase_impl& operator=(LayoutBase_impl const&) = default;
+  RAJA_INLINE LayoutBase_impl& operator=(LayoutBase_impl&&)      = default;
 
   /*!
    * Construct a layout given the size of each dimension.
    */
   template <typename... Types>
   RAJA_INLINE RAJA_HOST_DEVICE constexpr LayoutBase_impl(Types... ns)
-      : sizes{static_cast<IdxLin>(stripIndexType(ns))...},
-        strides{(detail::stride_calculator<RangeInts + 1, n_dims, IdxLin>{}(
+      : sizes {static_cast<IdxLin>(stripIndexType(ns))...},
+        strides {(detail::stride_calculator<RangeInts + 1, n_dims, IdxLin> {}(
             sizes[RangeInts] ? IdxLin(1) : IdxLin(0),
             sizes))...},
-        inv_strides{(strides[RangeInts] ? strides[RangeInts] : IdxLin(1))...},
-        inv_mods{(sizes[RangeInts] ? sizes[RangeInts] : IdxLin(1))...}
+        inv_strides {(strides[RangeInts] ? strides[RangeInts] : IdxLin(1))...},
+        inv_mods {(sizes[RangeInts] ? sizes[RangeInts] : IdxLin(1))...}
   {
     static_assert(n_dims == sizeof...(Types),
                   "number of dimensions must match");
@@ -115,15 +113,15 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
    *  Templated copy ctor from simillar layout.
    */
   template <typename CIdxLin, ptrdiff_t CStrideOneDim>
-  constexpr RAJA_INLINE RAJA_HOST_DEVICE LayoutBase_impl(
-      const LayoutBase_impl<camp::idx_seq<RangeInts...>, CIdxLin, CStrideOneDim>
-          &rhs)
-      : sizes{static_cast<IdxLin>(rhs.sizes[RangeInts])...},
-        strides{static_cast<IdxLin>(rhs.strides[RangeInts])...},
-        inv_strides{static_cast<IdxLin>(rhs.inv_strides[RangeInts])...},
-        inv_mods{static_cast<IdxLin>(rhs.inv_mods[RangeInts])...}
-  {
-  }
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE
+  LayoutBase_impl(const LayoutBase_impl<camp::idx_seq<RangeInts...>,
+                                        CIdxLin,
+                                        CStrideOneDim>& rhs)
+      : sizes {static_cast<IdxLin>(rhs.sizes[RangeInts])...},
+        strides {static_cast<IdxLin>(rhs.strides[RangeInts])...},
+        inv_strides {static_cast<IdxLin>(rhs.inv_strides[RangeInts])...},
+        inv_mods {static_cast<IdxLin>(rhs.inv_mods[RangeInts])...}
+  {}
 
 
   /*!
@@ -131,36 +129,35 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
    */
   template <typename... Types>
   RAJA_INLINE constexpr LayoutBase_impl(
-      const std::array<IdxLin, n_dims> &sizes_in,
-      const std::array<IdxLin, n_dims> &strides_in)
-      : sizes{sizes_in[RangeInts]...},
-        strides{strides_in[RangeInts]...},
-        inv_strides{(strides[RangeInts] ? strides[RangeInts] : IdxLin(1))...},
-        inv_mods{(sizes[RangeInts] ? sizes[RangeInts] : IdxLin(1))...}
-  {
-  }
+      const std::array<IdxLin, n_dims>& sizes_in,
+      const std::array<IdxLin, n_dims>& strides_in)
+      : sizes {sizes_in[RangeInts]...},
+        strides {strides_in[RangeInts]...},
+        inv_strides {(strides[RangeInts] ? strides[RangeInts] : IdxLin(1))...},
+        inv_mods {(sizes[RangeInts] ? sizes[RangeInts] : IdxLin(1))...}
+  {}
 
   /*!
    * Methods to performs bounds checking in layout objects
    */
-  template<camp::idx_t N, typename Idx>
+  template <camp::idx_t N, typename Idx>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheckError(Idx idx) const
   {
     printf("Error at index %d, value %ld is not within bounds [0, %ld] \n",
-           static_cast<int>(N), static_cast<long int>(idx), static_cast<long int>(sizes[N] - 1));
+           static_cast<int>(N), static_cast<long int>(idx),
+           static_cast<long int>(sizes[N] - 1));
     RAJA_ABORT_OR_THROW("Out of bounds error \n");
   }
 
   template <camp::idx_t N>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck() const
-  {
-  }
+  {}
 
   template <camp::idx_t N, typename Idx, typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx,
                                                 Indices... indices) const
   {
-    if(sizes[N] > 0 && !(0<=idx && idx < static_cast<Idx>(sizes[N])))
+    if (sizes[N] > 0 && !(0 <= idx && idx < static_cast<Idx>(sizes[N])))
     {
       BoundsCheckError<N>(idx);
     }
@@ -180,16 +177,16 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin
   operator()(Indices... indices) const
   {
-#if defined (RAJA_BOUNDS_CHECK_INTERNAL)
+#if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     BoundsCheck<0>(indices...);
 #endif
     // dot product of strides and indices
-    return sum<IdxLin>(
-      (RangeInts==stride_one_dim ?   // Is this dimension stride-one?
-         indices :  // it's stride one, so dont bother with multiply
-         strides[RangeInts]*indices // it's not stride one
-			)...
-    );
+    return sum<IdxLin>((RangeInts == stride_one_dim
+                            ?  // Is this dimension stride-one?
+                            indices
+                            :  // it's stride one, so dont bother with multiply
+                            strides[RangeInts] * indices  // it's not stride one
+                        )...);
   }
 
 
@@ -205,20 +202,22 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
    */
   template <typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              Indices &&... indices) const
+                                              Indices&&... indices) const
   {
 #if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     IdxLin totSize = size_noproj();
-    if(totSize > 0 && (linear_index < 0 || linear_index >= totSize)) {
+    if (totSize > 0 && (linear_index < 0 || linear_index >= totSize))
+    {
       printf("Error! Linear index %ld is not within bounds [0, %ld]. \n",
-             static_cast<long int>(linear_index), static_cast<long int>(totSize-1));
+             static_cast<long int>(linear_index),
+             static_cast<long int>(totSize - 1));
       RAJA_ABORT_OR_THROW("Out of bounds error \n");
-     }
+    }
 #endif
 
-    camp::sink((indices =
-      (camp::decay<Indices>)((linear_index / inv_strides[RangeInts]) %
-                             inv_mods[RangeInts]))...);
+    camp::sink((indices = (camp::decay<Indices>)((linear_index /
+                                                  inv_strides[RangeInts]) %
+                                                 inv_mods[RangeInts]))...);
   }
 
   /*!
@@ -231,8 +230,9 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   {
     // Multiply together all of the sizes,
     // replacing 1 for any zero-sized dimensions
-    return foldl(RAJA::operators::multiplies<IdxLin>(),
-                         (sizes[RangeInts] == IdxLin(0) ? IdxLin(1) : sizes[RangeInts])...);
+    return foldl(
+        RAJA::operators::multiplies<IdxLin>(),
+        (sizes[RangeInts] == IdxLin(0) ? IdxLin(1) : sizes[RangeInts])...);
   }
 
   /*!
@@ -247,27 +247,21 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
     return foldl(RAJA::operators::multiplies<IdxLin>(), sizes[RangeInts]...);
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return strides[DIM];
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return sizes[DIM];
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
 };
@@ -338,11 +332,12 @@ struct TypedLayout;
 
 template <typename IdxLin, typename... DimTypes, ptrdiff_t StrideOne>
 struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
-    : public Layout<sizeof...(DimTypes), strip_index_type_t<IdxLin>, StrideOne> {
+    : public Layout<sizeof...(DimTypes), strip_index_type_t<IdxLin>, StrideOne>
+{
 
   using StrippedIdxLin = strip_index_type_t<IdxLin>;
-  using Self = TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>;
-  using Base = Layout<sizeof...(DimTypes), StrippedIdxLin, StrideOne>;
+  using Self   = TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>;
+  using Base   = Layout<sizeof...(DimTypes), StrippedIdxLin, StrideOne>;
   using DimArr = std::array<StrippedIdxLin, sizeof...(DimTypes)>;
 
   // Pull in base constructors
@@ -356,8 +351,8 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
    * @param indices  Indices in the n-dimensional space of this layout
    * @return Linear space index.
    */
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
-      DimTypes... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin
+  operator()(DimTypes... indices) const
   {
     return IdxLin(Base::operator()(stripIndexType(indices)...));
   }
@@ -374,11 +369,11 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
    *                 dimensionality of this layout.
    */
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              DimTypes &... indices) const
+                                              DimTypes&... indices) const
   {
-    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)>{},
+    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)> {},
                     std::forward<IdxLin>(linear_index),
-                    std::forward<DimTypes &>(indices)...);
+                    std::forward<DimTypes&>(indices)...);
   }
 
 private:
@@ -392,11 +387,12 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
   template <typename... Indices, camp::idx_t... RangeInts>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndicesHelper(camp::idx_seq<RangeInts...>,
                                                     IdxLin linear_index,
-                                                    Indices &... indices) const
+                                                    Indices&... indices) const
   {
     StrippedIdxLin locals[sizeof...(DimTypes)];
     Base::toIndices(stripIndexType(linear_index), locals[RangeInts]...);
-		camp::sink((indices = Indices{static_cast<Indices>(locals[RangeInts])})...);
+    camp::sink(
+        (indices = Indices {static_cast<Indices>(locals[RangeInts])})...);
   }
 };
 
@@ -406,8 +402,8 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
  *
  */
 template <ptrdiff_t s1_dim, size_t n_dims, typename IdxLin>
-RAJA_INLINE Layout<n_dims, IdxLin, s1_dim> make_stride_one(
-    Layout<n_dims, IdxLin> const &l)
+RAJA_INLINE Layout<n_dims, IdxLin, s1_dim>
+make_stride_one(Layout<n_dims, IdxLin> const& l)
 {
   return Layout<n_dims, IdxLin, s1_dim>(l);
 }
@@ -418,12 +414,12 @@ RAJA_INLINE Layout<n_dims, IdxLin, s1_dim> make_stride_one(
  *
  */
 template <ptrdiff_t s1_dim, typename IdxLin, typename IdxTuple>
-RAJA_INLINE TypedLayout<IdxLin, IdxTuple, s1_dim> make_stride_one(
-    TypedLayout<IdxLin, IdxTuple> const &l)
+RAJA_INLINE TypedLayout<IdxLin, IdxTuple, s1_dim>
+make_stride_one(TypedLayout<IdxLin, IdxTuple> const& l)
 {
   // strip l to it's base-class type
-  using Base = typename TypedLayout<IdxLin, IdxTuple>::Base;
-  Base const &b = (Base const &)l;
+  using Base    = typename TypedLayout<IdxLin, IdxTuple>::Base;
+  Base const& b = (Base const&)l;
 
   // Use non-typed layout to initialize new typed layout
   return TypedLayout<IdxLin, IdxTuple, s1_dim>(b);
diff --git a/include/RAJA/util/LocalArray.hpp b/include/RAJA/util/LocalArray.hpp
index 50680101d4..faa5910704 100644
--- a/include/RAJA/util/LocalArray.hpp
+++ b/include/RAJA/util/LocalArray.hpp
@@ -31,8 +31,7 @@ namespace RAJA
 {
 
 
-
-template<camp::idx_t ... Sizes>
+template <camp::idx_t... Sizes>
 using ParamList = camp::idx_seq<Sizes...>;
 
 /*!
@@ -51,79 +50,86 @@ using ParamList = camp::idx_seq<Sizes...>;
  */
 
 
-namespace internal {
-
-
+namespace internal
+{
 
-  template<typename Perm, typename Sizes>
-  struct StaticLayoutHelper;
 
-  template<camp::idx_t ... Perm, Index_type ...Sizes>
-  struct StaticLayoutHelper<camp::idx_seq<Perm...>, SizeList<Sizes...>>{
-      using type =  StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
-  };
+template <typename Perm, typename Sizes>
+struct StaticLayoutHelper;
 
-  template<typename Perm, typename Sizes>
-  using getStaticLayoutType = typename StaticLayoutHelper<Perm, Sizes>::type;
+template <camp::idx_t... Perm, Index_type... Sizes>
+struct StaticLayoutHelper<camp::idx_seq<Perm...>, SizeList<Sizes...>>
+{
+  using type = StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
+};
 
+template <typename Perm, typename Sizes>
+using getStaticLayoutType = typename StaticLayoutHelper<Perm, Sizes>::type;
 
 
-}
+}  // namespace internal
 
 
-template<typename ValueType, typename Perm, typename Sizes, typename... IndexTypes>
+template <typename ValueType,
+          typename Perm,
+          typename Sizes,
+          typename... IndexTypes>
 using TypedLocalArray =
-    internal::TypedViewBase<ValueType, ValueType *, internal::getStaticLayoutType<Perm, Sizes>, camp::list<IndexTypes...> >;
+    internal::TypedViewBase<ValueType,
+                            ValueType*,
+                            internal::getStaticLayoutType<Perm, Sizes>,
+                            camp::list<IndexTypes...>>;
 
 
-template<typename ValueType, typename Perm, typename Sizes>
+template <typename ValueType, typename Perm, typename Sizes>
 using LocalArray =
-    internal::TypedViewBase<ValueType, ValueType *, internal::getStaticLayoutType<Perm, Sizes>, internal::getDefaultIndexTypes<Perm> >;
-
-
-
-
-
-template<typename AtomicPolicy, typename DataType, typename Perm,
-         typename Sizes, typename ... IndexTypes>
-struct AtomicTypedLocalArray {
-};
-
-template<typename AtomicPolicy, typename DataType, camp::idx_t ... Perm,
-          Index_type ... Sizes, typename ... IndexTypes>
-struct AtomicTypedLocalArray<AtomicPolicy, DataType, camp::idx_seq<Perm ...>,
-                             RAJA::SizeList<Sizes ...>, IndexTypes ...>{
-  DataType *m_arrayPtr = nullptr;
-  using value_type = DataType;
-  using atomic_ref_t = RAJA::AtomicRef<value_type, AtomicPolicy>;
-  using layout_type = RAJA::StaticLayout<camp::idx_seq<Perm ...>, Sizes ...>;
+    internal::TypedViewBase<ValueType,
+                            ValueType*,
+                            internal::getStaticLayoutType<Perm, Sizes>,
+                            internal::getDefaultIndexTypes<Perm>>;
+
+
+template <typename AtomicPolicy,
+          typename DataType,
+          typename Perm,
+          typename Sizes,
+          typename... IndexTypes>
+struct AtomicTypedLocalArray
+{};
+
+template <typename AtomicPolicy,
+          typename DataType,
+          camp::idx_t... Perm,
+          Index_type... Sizes,
+          typename... IndexTypes>
+struct AtomicTypedLocalArray<AtomicPolicy,
+                             DataType,
+                             camp::idx_seq<Perm...>,
+                             RAJA::SizeList<Sizes...>,
+                             IndexTypes...>
+{
+  DataType* m_arrayPtr = nullptr;
+  using value_type     = DataType;
+  using atomic_ref_t   = RAJA::AtomicRef<value_type, AtomicPolicy>;
+  using layout_type    = RAJA::StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
   static const camp::idx_t NumElem = layout_type::s_size;
 
   RAJA_HOST_DEVICE
-  atomic_ref_t operator()(IndexTypes ... indices) const
+  atomic_ref_t operator()(IndexTypes... indices) const
   {
-    return(atomic_ref_t(&m_arrayPtr[layout_type::s_oper(stripIndexType(indices)
-                                                     ...)]));
+    return (atomic_ref_t(
+        &m_arrayPtr[layout_type::s_oper(stripIndexType(indices)...)]));
   }
 
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  constexpr
-  camp::idx_t size() const
-  {
-    return layout_type::s_size;
-  }
+  constexpr camp::idx_t size() const { return layout_type::s_size; }
 
   RAJA_HOST_DEVICE
-  RAJA_INLINE void set_data(DataType * data_ptr){
-    m_arrayPtr = data_ptr;
-  }
+  RAJA_INLINE void set_data(DataType* data_ptr) { m_arrayPtr = data_ptr; }
 };
 
 
-
-
-
 }  // end namespace RAJA
 
 
diff --git a/include/RAJA/util/OffsetLayout.hpp b/include/RAJA/util/OffsetLayout.hpp
index 827515062e..858f444f74 100644
--- a/include/RAJA/util/OffsetLayout.hpp
+++ b/include/RAJA/util/OffsetLayout.hpp
@@ -41,66 +41,67 @@ template <typename Range, typename IdxLin>
 struct OffsetLayout_impl;
 
 template <camp::idx_t... RangeInts, typename IdxLin>
-struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
-  using Self = OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin>;
-  using IndexRange = camp::idx_seq<RangeInts...>;
+struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin>
+{
+  using Self        = OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin>;
+  using IndexRange  = camp::idx_seq<RangeInts...>;
   using IndexLinear = IdxLin;
-  using Base = RAJA::detail::LayoutBase_impl<IndexRange, IdxLin>;
+  using Base        = RAJA::detail::LayoutBase_impl<IndexRange, IdxLin>;
   Base base_;
 
   static constexpr camp::idx_t stride_one_dim = Base::stride_one_dim;
 
   static constexpr size_t n_dims = sizeof...(RangeInts);
-  IdxLin offsets[n_dims]={0}; //If not specified set to zero
+  IdxLin offsets[n_dims]         = {0};  // If not specified, set to zero
 
-  constexpr RAJA_INLINE OffsetLayout_impl(
-      std::array<IdxLin, sizeof...(RangeInts)> begin,
-      std::array<IdxLin, sizeof...(RangeInts)> end)
-      : base_{(end[RangeInts] - begin[RangeInts])...},
-        offsets{begin[RangeInts]...}
-  {
-  }
+  constexpr RAJA_INLINE
+  OffsetLayout_impl(std::array<IdxLin, sizeof...(RangeInts)> begin,
+                    std::array<IdxLin, sizeof...(RangeInts)> end)
+      : base_ {(end[RangeInts] - begin[RangeInts])...},
+        offsets {begin[RangeInts]...}
+  {}
 
   constexpr RAJA_INLINE RAJA_HOST_DEVICE OffsetLayout_impl(Self const& c)
-      : base_(c.base_), offsets{c.offsets[RangeInts]...}
-  {
-  }
+      : base_(c.base_), offsets {c.offsets[RangeInts]...}
+  {}
 
   void shift(std::array<IdxLin, sizeof...(RangeInts)> shift)
   {
-    for(size_t i=0; i<n_dims; ++i) offsets[i] += shift[i];
+    for (size_t i = 0; i < n_dims; ++i)
+      offsets[i] += shift[i];
   }
 
-  template<camp::idx_t N, typename Idx>
+  template <camp::idx_t N, typename Idx>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheckError(Idx idx) const
   {
     printf("Error at index %d, value %ld is not within bounds [%ld, %ld] \n",
            static_cast<int>(N), static_cast<long int>(idx),
-           static_cast<long int>(offsets[N]), static_cast<long int>(offsets[N] + base_.sizes[N] - 1));
+           static_cast<long int>(offsets[N]),
+           static_cast<long int>(offsets[N] + base_.sizes[N] - 1));
     RAJA_ABORT_OR_THROW("Out of bounds error \n");
   }
 
   template <camp::idx_t N>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck() const
-  {
-  }
+  {}
 
   template <camp::idx_t N, typename Idx, typename... Indices>
-  RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx, Indices... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx,
+                                                Indices... indices) const
   {
-    if(!(offsets[N] <=idx && idx < offsets[N] + base_.sizes[N]))
+    if (!(offsets[N] <= idx && idx < offsets[N] + base_.sizes[N]))
     {
       BoundsCheckError<N>(idx);
     }
     RAJA_UNUSED_VAR(idx);
-    BoundsCheck<N+1>(indices...);
+    BoundsCheck<N + 1>(indices...);
   }
 
   template <typename... Indices>
-  RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin operator()(
-      Indices... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin
+  operator()(Indices... indices) const
   {
-#if defined (RAJA_BOUNDS_CHECK_INTERNAL)
+#if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     BoundsCheck<0>(indices...);
 #endif
     return base_((indices - offsets[RangeInts])...);
@@ -108,7 +109,7 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
 
   template <typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              Indices &&... indices) const
+                                              Indices&&... indices) const
   {
     base_.toIndices(linear_index, std::forward<Indices>(indices)...);
     camp::sink((indices = (offsets[RangeInts] + indices))...);
@@ -119,16 +120,15 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
       const std::array<IdxLin, sizeof...(RangeInts)>& offsets_in,
       const Layout<sizeof...(RangeInts), IdxLin>& rhs)
   {
-    OffsetLayout_impl ret{rhs};
+    OffsetLayout_impl ret {rhs};
     camp::sink((ret.offsets[RangeInts] = offsets_in[RangeInts])...);
     return ret;
   }
 
   constexpr RAJA_INLINE RAJA_HOST_DEVICE
   OffsetLayout_impl(const Layout<sizeof...(RangeInts), IdxLin>& rhs)
-      : base_{rhs}
-  {
-  }
+      : base_ {rhs}
+  {}
 
   RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin size() const
   {
@@ -140,27 +140,21 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
     return base_.size_noproj();
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return base_.get_dim_stride();
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return base_.get_dim_size();
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return offsets[DIM];
   }
 };
@@ -169,7 +163,8 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
 
 template <size_t n_dims = 1, typename IdxLin = Index_type>
 struct OffsetLayout
-    : public internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin> {
+    : public internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin>
+{
   using Base =
       internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin>;
 
@@ -179,56 +174,57 @@ struct OffsetLayout
   constexpr RAJA_INLINE RAJA_HOST_DEVICE OffsetLayout(
       const internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin>&
           rhs)
-      : Base{rhs}
-  {
-  }
+      : Base {rhs}
+  {}
 };
 
-//TypedOffsetLayout
+// TypedOffsetLayout
 template <typename IdxLin, typename DimTuple>
 struct TypedOffsetLayout;
 
 template <typename IdxLin, typename... DimTypes>
 struct TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>
-: public OffsetLayout<sizeof...(DimTypes), strip_index_type_t<IdxLin>>
+    : public OffsetLayout<sizeof...(DimTypes), strip_index_type_t<IdxLin>>
 {
-   using StrippedIdxLin = strip_index_type_t<IdxLin>;
-   using Self = TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
-   using Base = OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>;
-   using DimArr = std::array<StrippedIdxLin, sizeof...(DimTypes)>;
-   using DimTuple = camp::tuple<DimTypes...>;
-   using IndexLinear = IdxLin;
-
-   // Pull in base coonstructors
- #if 0
+  using StrippedIdxLin = strip_index_type_t<IdxLin>;
+  using Self           = TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
+  using Base           = OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>;
+  using DimArr         = std::array<StrippedIdxLin, sizeof...(DimTypes)>;
+  using DimTuple       = camp::tuple<DimTypes...>;
+  using IndexLinear    = IdxLin;
+
+  // Pull in base constructors
+#if 0
    // This breaks with nvcc11
  using Base::Base;
- #else
-   using OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>::OffsetLayout;
- #endif
+#else
+  using OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>::OffsetLayout;
+#endif
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(DimTypes... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin
+  operator()(DimTypes... indices) const
   {
     return IdxLin(Base::operator()(stripIndexType(indices)...));
   }
 
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              DimTypes &... indices) const
+                                              DimTypes&... indices) const
   {
-    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)>{},
+    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)> {},
                     std::forward<IdxLin>(linear_index),
-                    std::forward<DimTypes &>(indices)...);
+                    std::forward<DimTypes&>(indices)...);
   }
 
 private:
   template <typename... Indices, camp::idx_t... RangeInts>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndicesHelper(camp::idx_seq<RangeInts...>,
                                                     IdxLin linear_index,
-                                                    Indices &... indices) const
+                                                    Indices&... indices) const
   {
     StrippedIdxLin locals[sizeof...(DimTypes)];
     Base::toIndices(stripIndexType(linear_index), locals[RangeInts]...);
-    camp::sink((indices = Indices{static_cast<Indices>(locals[RangeInts])})...);
+    camp::sink(
+        (indices = Indices {static_cast<Indices>(locals[RangeInts])})...);
   }
 };
 
@@ -238,7 +234,7 @@ auto make_offset_layout(const std::array<IdxLin, n_dims>& begin,
                         const std::array<IdxLin, n_dims>& end)
     -> OffsetLayout<n_dims, IdxLin>
 {
-  return OffsetLayout<n_dims, IdxLin>{begin, end};
+  return OffsetLayout<n_dims, IdxLin> {begin, end};
 }
 
 template <size_t Rank, typename IdxLin = Index_type>
@@ -248,7 +244,8 @@ auto make_permuted_offset_layout(const std::array<IdxLin, Rank>& begin,
     -> decltype(make_offset_layout<Rank, IdxLin>(begin, end))
 {
   std::array<IdxLin, Rank> sizes;
-  for (size_t i = 0; i < Rank; ++i) {
+  for (size_t i = 0; i < Rank; ++i)
+  {
     sizes[i] = end[i] - begin[i];
   }
   return internal::OffsetLayout_impl<camp::make_idx_seq_t<Rank>, IdxLin>::
diff --git a/include/RAJA/util/OffsetOperators.hpp b/include/RAJA/util/OffsetOperators.hpp
index 150aaeee34..77c880b08e 100644
--- a/include/RAJA/util/OffsetOperators.hpp
+++ b/include/RAJA/util/OffsetOperators.hpp
@@ -29,15 +29,19 @@ namespace RAJA
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct GetOffsetLeft
 {
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
+  template <typename new_Ret,
+            typename new_Arg1 = new_Ret,
+            typename new_Arg2 = new_Ret>
   using rebind = GetOffsetLeft<new_Ret, new_Arg1, new_Arg2>;
 
-  template < size_t >
+  template <size_t>
   using rebunch = GetOffsetLeft<Ret, Arg1, Arg2>;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& num_i,
-                 Arg2 const& j, Arg2 const& RAJA_UNUSED_ARG(num_j)) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret
+  operator()(Arg1 const& i,
+             Arg1 const& num_i,
+             Arg2 const& j,
+             Arg2 const& RAJA_UNUSED_ARG(num_j)) const noexcept
   {
     return i + j * num_i;
   }
@@ -46,35 +50,46 @@ struct GetOffsetLeft
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct GetOffsetRight
 {
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
+  template <typename new_Ret,
+            typename new_Arg1 = new_Ret,
+            typename new_Arg2 = new_Ret>
   using rebind = GetOffsetRight<new_Ret, new_Arg1, new_Arg2>;
 
-  template < size_t >
+  template <size_t>
   using rebunch = GetOffsetRight<Ret, Arg1, Arg2>;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i),
-                 Arg2 const& j, Arg2 const& num_j) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret
+  operator()(Arg1 const& i,
+             Arg1 const& RAJA_UNUSED_ARG(num_i),
+             Arg2 const& j,
+             Arg2 const& num_j) const noexcept
   {
     return i * num_j + j;
   }
 };
 
 template <size_t t_bunch_num_i,
-          typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+          typename Ret,
+          typename Arg1 = Ret,
+          typename Arg2 = Arg1>
 struct GetOffsetLeftBunched
 {
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
-  using rebind = GetOffsetLeftBunched<t_bunch_num_i, new_Ret, new_Arg1, new_Arg2>;
+  template <typename new_Ret,
+            typename new_Arg1 = new_Ret,
+            typename new_Arg2 = new_Ret>
+  using rebind =
+      GetOffsetLeftBunched<t_bunch_num_i, new_Ret, new_Arg1, new_Arg2>;
 
-  template < size_t new_bunch_num_i >
+  template <size_t new_bunch_num_i>
   using rebunch = GetOffsetLeftBunched<new_bunch_num_i, Ret, Arg1, Arg2>;
 
-  static constexpr Arg1 bunch_num_i{t_bunch_num_i};
+  static constexpr Arg1 bunch_num_i {t_bunch_num_i};
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i),
-                 Arg2 const& j, Arg2 const& num_j) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret
+  operator()(Arg1 const& i,
+             Arg1 const& RAJA_UNUSED_ARG(num_i),
+             Arg2 const& j,
+             Arg2 const& num_j) const noexcept
   {
     // assert(num_i >= bunch_num_i)
     Arg1 i_inner = i % bunch_num_i;
diff --git a/include/RAJA/util/Operators.hpp b/include/RAJA/util/Operators.hpp
index b4249e7182..1350a7085f 100644
--- a/include/RAJA/util/Operators.hpp
+++ b/include/RAJA/util/Operators.hpp
@@ -43,35 +43,38 @@ namespace detail
 {
 
 // truly associative (does not include fp add/multiply)
-struct associative_tag {
-};
+struct associative_tag
+{};
 
 // associative up to floating point rounding differences
-struct fp_associative_tag : associative_tag {
-};
+struct fp_associative_tag : associative_tag
+{};
 
 // get associativity tag appropriate for the type
-template < typename T >
+template <typename T>
 using associative_or_fp_associative_tag =
-  std::conditional_t<std::is_floating_point<std::decay_t<T>>::value,
-                     fp_associative_tag, associative_tag>;
+    std::conditional_t<std::is_floating_point<std::decay_t<T>>::value,
+                       fp_associative_tag,
+                       associative_tag>;
 
 template <typename Arg1, typename Arg2, typename Result>
-struct binary_function {
-  using first_argument_type = Arg1;
+struct binary_function
+{
+  using first_argument_type  = Arg1;
   using second_argument_type = Arg2;
-  using result_type = Result;
+  using result_type          = Result;
 };
 
 template <typename Argument, typename Result>
-struct unary_function {
+struct unary_function
+{
   using argument_type = Argument;
-  using result_type = Result;
+  using result_type   = Result;
 };
 
 template <typename Arg1, typename Arg2>
-struct comparison_function : public binary_function<Arg1, Arg2, bool> {
-};
+struct comparison_function : public binary_function<Arg1, Arg2, bool>
+{};
 
 }  // namespace detail
 
@@ -79,13 +82,15 @@ namespace types
 {
 
 template <typename T>
-struct is_unsigned_int {
+struct is_unsigned_int
+{
   static constexpr const bool value =
       std::is_unsigned<T>::value && std::is_integral<T>::value;
 };
 
 template <typename T>
-struct is_signed_int {
+struct is_signed_int
+{
   static constexpr const bool value =
       !std::is_unsigned<T>::value && std::is_integral<T>::value;
 };
@@ -96,51 +101,60 @@ struct is_signed_int {
    type)
 */
 template <typename T, bool GPU = false>
-struct larger {
-};
+struct larger
+{};
 
 template <>
-struct larger<uint8_t> {
+struct larger<uint8_t>
+{
   using type = uint16_t;
 };
 
 template <>
-struct larger<uint16_t> {
+struct larger<uint16_t>
+{
   using type = uint32_t;
 };
 
 template <>
-struct larger<uint32_t> {
+struct larger<uint32_t>
+{
   using type = uint64_t;
 };
 
 template <>
-struct larger<int8_t> {
+struct larger<int8_t>
+{
   using type = int16_t;
 };
 
 template <>
-struct larger<int16_t> {
+struct larger<int16_t>
+{
   using type = int32_t;
 };
 
 template <>
-struct larger<int32_t> {
+struct larger<int32_t>
+{
   using type = int64_t;
 };
 
 template <>
-struct larger<float> {
+struct larger<float>
+{
   using type = double;
 };
 
 template <>
-struct larger<double> {
+struct larger<double>
+{
   using type = long double;
 };
 
 template <>
-struct larger<double, true> {
+struct larger<double, true>
+{
   using type = double;
 };
 
@@ -148,26 +162,30 @@ namespace detail
 {
 
 template <typename T, bool isInt, bool isSigned, bool isFP, bool gpu = false>
-struct largest {
-};
+struct largest
+{};
 
 template <typename T>
-struct largest<T, true, false, false> {
+struct largest<T, true, false, false>
+{
   using type = uint64_t;
 };
 
 template <typename T>
-struct largest<T, true, true, false> {
+struct largest<T, true, true, false>
+{
   using type = int64_t;
 };
 
 template <typename T>
-struct largest<T, false, false, true, false> {
+struct largest<T, false, false, true, false>
+{
   using type = long double;
 };
 
 template <typename T>
-struct largest<T, false, false, true, true> {
+struct largest<T, false, false, true, true>
+{
   using type = double;
 };
 }  // namespace detail
@@ -177,7 +195,8 @@ struct largest<T, false, false, true, true> {
    pass 'true' as second template argument
 */
 template <typename T, bool gpu = false>
-struct largest {
+struct largest
+{
   using type = typename detail::largest<T,
                                         std::is_integral<T>::value,
                                         std::is_signed<T>::value,
@@ -187,30 +206,37 @@ struct largest {
 
 
 template <typename T>
-struct size_of {
-  enum { value = sizeof(T) };
+struct size_of
+{
+  enum
+  {
+    value = sizeof(T)
+  };
 };
 
 namespace detail
 {
 
 template <typename T, typename U, bool lhsLarger>
-struct larger_of {
-};
+struct larger_of
+{};
 
 template <typename T, typename U>
-struct larger_of<T, U, true> {
+struct larger_of<T, U, true>
+{
   using type = T;
 };
 
 template <typename T, typename U>
-struct larger_of<T, U, false> {
+struct larger_of<T, U, false>
+{
   using type = U;
 };
 }  // namespace detail
 
 template <typename T, typename U>
-struct larger_of {
+struct larger_of
+{
   using type = typename detail::
       larger_of<T, U, (size_of<T>::value > size_of<U>::value)>::type;
 };
@@ -218,7 +244,6 @@ struct larger_of {
 }  // namespace types
 
 
-
 template <typename T, typename Enable = void>
 struct limits;
 
@@ -226,27 +251,27 @@ struct limits;
 // limits for signed integer types
 template <typename T>
 struct limits<T,
-  typename std::enable_if<std::is_integral<T>::value &&
-  !std::is_unsigned<T>::value>::type>
+              typename std::enable_if<std::is_integral<T>::value &&
+                                      !std::is_unsigned<T>::value>::type>
 {
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T min()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
-    return static_cast<T>(1llu << ((8llu * sizeof(T)) - 1llu) );
+    return static_cast<T>(1llu << ((8llu * sizeof(T)) - 1llu));
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T max()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
     return static_cast<T>(~(1llu << ((8llu * sizeof(T)) - 1llu)));
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
 };
@@ -254,8 +279,8 @@ struct limits<T,
 // limits for signed integer types
 template <typename T>
 struct limits<T,
-  typename std::enable_if<std::is_integral<T>::value &&
-  std::is_unsigned<T>::value>::type>
+              typename std::enable_if<std::is_integral<T>::value &&
+                                      std::is_unsigned<T>::value>::type>
 {
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T min()
   {
@@ -264,42 +289,36 @@ struct limits<T,
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T max()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
     return static_cast<T>(0xFFFFFFFFFFFFFFFF);
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
 };
 
 
 template <>
-struct limits<float> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float min()
-  {
-    return -FLT_MAX;
-  }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float max()
-  {
-    return FLT_MAX;
-  }
+struct limits<float>
+{
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float min() { return -FLT_MAX; }
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float max() { return FLT_MAX; }
 };
 
 template <>
-struct limits<double> {
+struct limits<double>
+{
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr double min()
   {
     return -DBL_MAX;
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr double max() 
-  { 
-     return DBL_MAX; 
-  }
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr double max() { return DBL_MAX; }
 };
 
 template <>
-struct limits<long double> {
+struct limits<long double>
+{
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr long double min()
   {
     return -LDBL_MAX;
@@ -338,51 +357,56 @@ static_assert(check<unsigned long long>(),
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct plus : public detail::binary_function<Arg1, Arg2, Ret>,
-              detail::associative_or_fp_associative_tag<Ret> {
+              detail::associative_or_fp_associative_tag<Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} + rhs;
+    return Ret {lhs} + rhs;
   }
-  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{0}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret {0}; }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct minus : public detail::binary_function<Arg1, Arg2, Ret> {
+struct minus : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} - rhs;
+    return Ret {lhs} - rhs;
   }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct multiplies : public detail::binary_function<Arg1, Arg2, Ret>,
-                    detail::associative_or_fp_associative_tag<Ret> {
+                    detail::associative_or_fp_associative_tag<Ret>
+{
 
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} * rhs;
+    return Ret {lhs} * rhs;
   }
-  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{1}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret {1}; }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct divides : public detail::binary_function<Arg1, Arg2, Ret> {
+struct divides : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} / rhs;
+    return Ret {lhs} / rhs;
   }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct modulus : public detail::binary_function<Arg1, Arg2, Ret> {
+struct modulus : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} % rhs;
+    return Ret {lhs} % rhs;
   }
 };
 
@@ -390,7 +414,8 @@ struct modulus : public detail::binary_function<Arg1, Arg2, Ret> {
 
 template <typename Arg1, typename Arg2 = Arg1>
 struct logical_and : public detail::comparison_function<Arg1, Arg2>,
-                     detail::associative_tag {
+                     detail::associative_tag
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -401,7 +426,8 @@ struct logical_and : public detail::comparison_function<Arg1, Arg2>,
 
 template <typename Arg1, typename Arg2 = Arg1>
 struct logical_or : public detail::comparison_function<Arg1, Arg2>,
-                    detail::associative_tag {
+                    detail::associative_tag
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -411,7 +437,8 @@ struct logical_or : public detail::comparison_function<Arg1, Arg2>,
 };
 
 template <typename T>
-struct logical_not : public detail::unary_function<T, bool> {
+struct logical_not : public detail::unary_function<T, bool>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const T& lhs) const
   {
     return !lhs;
@@ -421,30 +448,33 @@ struct logical_not : public detail::unary_function<T, bool> {
 // Bitwise
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct bit_or : public detail::binary_function<Arg1, Arg2, Ret> {
+struct bit_or : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
     return lhs | rhs;
   }
 
-RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{0}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret {0}; }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct bit_and : public detail::binary_function<Arg1, Arg2, Ret> {
+struct bit_and : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
     return lhs & rhs;
   }
 
-RAJA_HOST_DEVICE static constexpr Ret identity() { return ~Ret{0}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return ~Ret {0}; }
 };
 
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct bit_xor : public detail::binary_function<Arg1, Arg2, Ret> {
+struct bit_xor : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
@@ -461,7 +491,8 @@ struct bit_xor : public detail::binary_function<Arg1, Arg2, Ret> {
 */
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct minimum : public detail::binary_function<Arg1, Arg2, Ret>,
-                 detail::associative_tag {
+                 detail::associative_tag
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
@@ -475,7 +506,8 @@ struct minimum : public detail::binary_function<Arg1, Arg2, Ret>,
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct maximum : public detail::binary_function<Arg1, Arg2, Ret>,
-                 detail::associative_tag {
+                 detail::associative_tag
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
@@ -490,7 +522,8 @@ struct maximum : public detail::binary_function<Arg1, Arg2, Ret>,
 // Logical Comparison
 
 template <typename Arg1, typename Arg2 = Arg1>
-struct equal_to : public detail::comparison_function<Arg1, Arg2> {
+struct equal_to : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -499,7 +532,8 @@ struct equal_to : public detail::comparison_function<Arg1, Arg2> {
 };
 
 template <typename Arg1, typename Arg2 = Arg1>
-struct not_equal_to : public detail::comparison_function<Arg1, Arg2> {
+struct not_equal_to : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -508,7 +542,8 @@ struct not_equal_to : public detail::comparison_function<Arg1, Arg2> {
 };
 
 template <typename Arg1, typename Arg2 = Arg1>
-struct greater : public detail::comparison_function<Arg1, Arg2> {
+struct greater : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -517,7 +552,8 @@ struct greater : public detail::comparison_function<Arg1, Arg2> {
 };
 
 template <typename Arg1, typename Arg2 = Arg1>
-struct less : public detail::comparison_function<Arg1, Arg2> {
+struct less : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -527,7 +563,8 @@ struct less : public detail::comparison_function<Arg1, Arg2> {
 
 
 template <typename Arg1, typename Arg2 = Arg1>
-struct greater_equal : public detail::comparison_function<Arg1, Arg2> {
+struct greater_equal : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -536,7 +573,8 @@ struct greater_equal : public detail::comparison_function<Arg1, Arg2> {
 };
 
 template <typename Arg1, typename Arg2 = Arg1>
-struct less_equal : public detail::comparison_function<Arg1, Arg2> {
+struct less_equal : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -547,7 +585,8 @@ struct less_equal : public detail::comparison_function<Arg1, Arg2> {
 // Filters
 
 template <typename Ret, typename Orig = Ret>
-struct identity : public detail::unary_function<Orig, Ret> {
+struct identity : public detail::unary_function<Orig, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Orig& lhs) const
   {
     return lhs;
@@ -555,7 +594,8 @@ struct identity : public detail::unary_function<Orig, Ret> {
 };
 
 template <typename T, typename U>
-struct project1st : public detail::binary_function<T, U, T> {
+struct project1st : public detail::binary_function<T, U, T>
+{
   RAJA_HOST_DEVICE constexpr T operator()(const T& lhs,
                                           const U& RAJA_UNUSED_ARG(rhs)) const
   {
@@ -564,7 +604,8 @@ struct project1st : public detail::binary_function<T, U, T> {
 };
 
 template <typename T, typename U = T>
-struct project2nd : public detail::binary_function<T, U, U> {
+struct project2nd : public detail::binary_function<T, U, U>
+{
   RAJA_HOST_DEVICE constexpr U operator()(const T& RAJA_UNUSED_ARG(lhs),
                                           const U& rhs) const
   {
@@ -575,13 +616,15 @@ struct project2nd : public detail::binary_function<T, U, U> {
 // Type Traits
 
 template <typename T>
-struct is_associative {
+struct is_associative
+{
   static constexpr const bool value =
       std::is_base_of<detail::associative_tag, T>::value;
 };
 
 template <typename T>
-struct is_fp_associative {
+struct is_fp_associative
+{
   static constexpr const bool value =
       std::is_base_of<detail::fp_associative_tag, T>::value;
 };
@@ -591,8 +634,8 @@ struct safe_plus
     : public plus<Arg1,
                   Arg2,
                   typename types::larger<
-                      typename types::larger_of<Arg1, Arg2>::type>::type> {
-};
+                      typename types::larger_of<Arg1, Arg2>::type>::type>
+{};
 
 }  // namespace operators
 
@@ -605,19 +648,20 @@ template <typename Function,
           typename Arg2 = Arg1>
 struct BinaryFunction
     : DefineConcept(::RAJA::concepts::convertible_to<Return>(
-          camp::val<Function>()(camp::val<Arg1>(), camp::val<Arg2>()))) {
-};
+          camp::val<Function>()(camp::val<Arg1>(), camp::val<Arg2>())))
+{};
 
 template <typename Function, typename Return, typename Arg = Return>
 struct UnaryFunction : DefineConcept(::RAJA::concepts::convertible_to<Return>(
-                           camp::val<Function>()(camp::val<Arg>()))) {
-};
+                           camp::val<Function>()(camp::val<Arg>())))
+{};
 
 namespace detail
 {
 
 template <typename Fun, typename Ret, typename T, typename U>
-using is_binary_function = ::RAJA::concepts::requires_<BinaryFunction, Ret, T, U>;
+using is_binary_function =
+    ::RAJA::concepts::requires_<BinaryFunction, Ret, T, U>;
 
 template <typename Fun, typename Ret, typename T>
 using is_unary_function = ::RAJA::concepts::requires_<UnaryFunction, Ret, T>;
diff --git a/include/RAJA/util/Permutations.hpp b/include/RAJA/util/Permutations.hpp
index e79e9f2830..2a70c4e760 100644
--- a/include/RAJA/util/Permutations.hpp
+++ b/include/RAJA/util/Permutations.hpp
@@ -31,46 +31,47 @@ template <typename Indices>
 struct as_array;
 
 template <camp::idx_t... Indices>
-struct as_array<camp::idx_seq<Indices...>> {
+struct as_array<camp::idx_seq<Indices...>>
+{
   static constexpr std::array<Index_type, sizeof...(Indices)> get()
   {
     return {{Indices...}};
   }
 };
 
-using PERM_I = camp::idx_seq<0>;
-using PERM_IJ = camp::idx_seq<0, 1>;
-using PERM_JI = camp::idx_seq<1, 0>;
-using PERM_IJK = camp::idx_seq<0, 1, 2>;
-using PERM_IKJ = camp::idx_seq<0, 2, 1>;
-using PERM_JIK = camp::idx_seq<1, 0, 2>;
-using PERM_JKI = camp::idx_seq<1, 2, 0>;
-using PERM_KIJ = camp::idx_seq<2, 0, 1>;
-using PERM_KJI = camp::idx_seq<2, 1, 0>;
-using PERM_IJKL = camp::idx_seq<0, 1, 2, 3>;
-using PERM_IJLK = camp::idx_seq<0, 1, 3, 2>;
-using PERM_IKJL = camp::idx_seq<0, 2, 1, 3>;
-using PERM_IKLJ = camp::idx_seq<0, 2, 3, 1>;
-using PERM_ILJK = camp::idx_seq<0, 3, 1, 2>;
-using PERM_ILKJ = camp::idx_seq<0, 3, 2, 1>;
-using PERM_JIKL = camp::idx_seq<1, 0, 2, 3>;
-using PERM_JILK = camp::idx_seq<1, 0, 3, 2>;
-using PERM_JKIL = camp::idx_seq<1, 2, 0, 3>;
-using PERM_JKLI = camp::idx_seq<1, 2, 3, 0>;
-using PERM_JLIK = camp::idx_seq<1, 3, 0, 2>;
-using PERM_JLKI = camp::idx_seq<1, 3, 2, 0>;
-using PERM_KIJL = camp::idx_seq<2, 0, 1, 3>;
-using PERM_KILJ = camp::idx_seq<2, 0, 3, 1>;
-using PERM_KJIL = camp::idx_seq<2, 1, 0, 3>;
-using PERM_KJLI = camp::idx_seq<2, 1, 3, 0>;
-using PERM_KLIJ = camp::idx_seq<2, 3, 0, 1>;
-using PERM_KLJI = camp::idx_seq<2, 3, 1, 0>;
-using PERM_LIJK = camp::idx_seq<3, 0, 1, 2>;
-using PERM_LIKJ = camp::idx_seq<3, 0, 2, 1>;
-using PERM_LJIK = camp::idx_seq<3, 1, 0, 2>;
-using PERM_LJKI = camp::idx_seq<3, 1, 2, 0>;
-using PERM_LKIJ = camp::idx_seq<3, 2, 0, 1>;
-using PERM_LKJI = camp::idx_seq<3, 2, 1, 0>;
+using PERM_I     = camp::idx_seq<0>;
+using PERM_IJ    = camp::idx_seq<0, 1>;
+using PERM_JI    = camp::idx_seq<1, 0>;
+using PERM_IJK   = camp::idx_seq<0, 1, 2>;
+using PERM_IKJ   = camp::idx_seq<0, 2, 1>;
+using PERM_JIK   = camp::idx_seq<1, 0, 2>;
+using PERM_JKI   = camp::idx_seq<1, 2, 0>;
+using PERM_KIJ   = camp::idx_seq<2, 0, 1>;
+using PERM_KJI   = camp::idx_seq<2, 1, 0>;
+using PERM_IJKL  = camp::idx_seq<0, 1, 2, 3>;
+using PERM_IJLK  = camp::idx_seq<0, 1, 3, 2>;
+using PERM_IKJL  = camp::idx_seq<0, 2, 1, 3>;
+using PERM_IKLJ  = camp::idx_seq<0, 2, 3, 1>;
+using PERM_ILJK  = camp::idx_seq<0, 3, 1, 2>;
+using PERM_ILKJ  = camp::idx_seq<0, 3, 2, 1>;
+using PERM_JIKL  = camp::idx_seq<1, 0, 2, 3>;
+using PERM_JILK  = camp::idx_seq<1, 0, 3, 2>;
+using PERM_JKIL  = camp::idx_seq<1, 2, 0, 3>;
+using PERM_JKLI  = camp::idx_seq<1, 2, 3, 0>;
+using PERM_JLIK  = camp::idx_seq<1, 3, 0, 2>;
+using PERM_JLKI  = camp::idx_seq<1, 3, 2, 0>;
+using PERM_KIJL  = camp::idx_seq<2, 0, 1, 3>;
+using PERM_KILJ  = camp::idx_seq<2, 0, 3, 1>;
+using PERM_KJIL  = camp::idx_seq<2, 1, 0, 3>;
+using PERM_KJLI  = camp::idx_seq<2, 1, 3, 0>;
+using PERM_KLIJ  = camp::idx_seq<2, 3, 0, 1>;
+using PERM_KLJI  = camp::idx_seq<2, 3, 1, 0>;
+using PERM_LIJK  = camp::idx_seq<3, 0, 1, 2>;
+using PERM_LIKJ  = camp::idx_seq<3, 0, 2, 1>;
+using PERM_LJIK  = camp::idx_seq<3, 1, 0, 2>;
+using PERM_LJKI  = camp::idx_seq<3, 1, 2, 0>;
+using PERM_LKIJ  = camp::idx_seq<3, 2, 0, 1>;
+using PERM_LKJI  = camp::idx_seq<3, 2, 1, 0>;
 using PERM_IJKLM = camp::idx_seq<0, 1, 2, 3, 4>;
 using PERM_IJKML = camp::idx_seq<0, 1, 2, 4, 3>;
 using PERM_IJLKM = camp::idx_seq<0, 1, 3, 2, 4>;
@@ -193,51 +194,51 @@ using PERM_MLKIJ = camp::idx_seq<4, 3, 2, 0, 1>;
 using PERM_MLKJI = camp::idx_seq<4, 3, 2, 1, 0>;
 
 
-
-
-namespace internal 
+namespace internal
 {
 
 
-template<camp::idx_t I, camp::idx_t J, camp::idx_t N, typename Perm>
+template <camp::idx_t I, camp::idx_t J, camp::idx_t N, typename Perm>
 struct CalcInversePermutationElem
 {
-  static constexpr camp::idx_t value = 
-    camp::seq_at<J, Perm>::value == I ? J : CalcInversePermutationElem<I, J+1, N, Perm>::value;
+  static constexpr camp::idx_t value =
+      camp::seq_at<J, Perm>::value == I
+          ? J
+          : CalcInversePermutationElem<I, J + 1, N, Perm>::value;
 };
 
-template<camp::idx_t I, camp::idx_t N, typename Perm>
+template <camp::idx_t I, camp::idx_t N, typename Perm>
 struct CalcInversePermutationElem<I, N, N, Perm>
 {
   static constexpr camp::idx_t value = I;
 };
 
 
-
-template<typename Range, typename Perm>
+template <typename Range, typename Perm>
 struct InversePermutationHelper;
 
-template<camp::idx_t ... Range, camp::idx_t ... Perm>
-struct InversePermutationHelper<camp::idx_seq<Range...>, 
-                                camp::idx_seq<Perm...>>
+template <camp::idx_t... Range, camp::idx_t... Perm>
+struct InversePermutationHelper<camp::idx_seq<Range...>, camp::idx_seq<Perm...>>
 {
   static_assert(sizeof...(Range) == sizeof...(Perm), "Fatal Error");
-  using type = camp::idx_seq< 
-    CalcInversePermutationElem<Range, 0, sizeof...(Range), camp::idx_seq<Perm...>>::value ...  
-  >;  
+  using type = camp::idx_seq<
+      CalcInversePermutationElem<Range,
+                                 0,
+                                 sizeof...(Range),
+                                 camp::idx_seq<Perm...>>::value...>;
 };
 
 
-
-} // namespace internal
-
+}  // namespace internal
 
 
 /*!
   Inverts a permutation
 */
-template<typename Perm>
-using invert_permutation = typename internal::InversePermutationHelper<camp::make_idx_seq_t<camp::size<Perm>::value>, Perm>::type;
+template <typename Perm>
+using invert_permutation = typename internal::InversePermutationHelper<
+    camp::make_idx_seq_t<camp::size<Perm>::value>,
+    Perm>::type;
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/PermutedLayout.hpp b/include/RAJA/util/PermutedLayout.hpp
index 5bb176215b..f7f1c627fb 100644
--- a/include/RAJA/util/PermutedLayout.hpp
+++ b/include/RAJA/util/PermutedLayout.hpp
@@ -67,26 +67,30 @@ auto make_permuted_layout(std::array<IdxLin, Rank> sizes,
 {
   std::array<IdxLin, Rank> strides;
   std::array<IdxLin, Rank> folded_strides;
-  for (size_t i = 0; i < Rank; ++i) {
+  for (size_t i = 0; i < Rank; ++i)
+  {
     // If the size of dimension i is zero, then the stride is zero
     folded_strides[i] = sizes[permutation[i]] ? 1 : 0;
-    for (size_t j = i + 1; j < Rank; ++j) {
+    for (size_t j = i + 1; j < Rank; ++j)
+    {
       folded_strides[i] *= sizes[permutation[j]] ? sizes[permutation[j]] : 1;
     }
   }
 
-  for (size_t i = 0; i < Rank; ++i) {
+  for (size_t i = 0; i < Rank; ++i)
+  {
     strides[permutation[i]] = folded_strides[i];
   }
 
 
   // return Layout<Rank, IdxLin>(sizes, strides);
-  auto ret  = Layout<Rank, IdxLin>();
-  for (size_t i = 0; i < Rank; ++i) {
-    ret.sizes[i] = sizes[i];
-    ret.strides[i] = strides[i];
+  auto ret = Layout<Rank, IdxLin>();
+  for (size_t i = 0; i < Rank; ++i)
+  {
+    ret.sizes[i]       = sizes[i];
+    ret.strides[i]     = strides[i];
     ret.inv_strides[i] = strides[i] ? strides[i] : 1;
-    ret.inv_mods[i] = sizes[i] ? sizes[i] : 1;
+    ret.inv_mods[i]    = sizes[i] ? sizes[i] : 1;
   }
   return ret;
 }
diff --git a/include/RAJA/util/PluginContext.hpp b/include/RAJA/util/PluginContext.hpp
index 996836e397..97aebf9431 100644
--- a/include/RAJA/util/PluginContext.hpp
+++ b/include/RAJA/util/PluginContext.hpp
@@ -11,31 +11,33 @@
 #include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/internal/get_platform.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 class KokkosPluginLoader;
 
-struct PluginContext {
-  public:
-    PluginContext(const Platform p) :
-      platform(p) {}
+struct PluginContext
+{
+public:
+  PluginContext(const Platform p) : platform(p) {}
 
-    Platform platform;
+  Platform platform;
 
-  private:
-    mutable uint64_t kID;
+private:
+  mutable uint64_t kID;
 
-    friend class KokkosPluginLoader;
+  friend class KokkosPluginLoader;
 };
 
-template<typename Policy>
+template <typename Policy>
 PluginContext make_context()
 {
-  return PluginContext{detail::get_platform<Policy>::value};
+  return PluginContext {detail::get_platform<Policy>::value};
 }
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/PluginLinker.hpp b/include/RAJA/util/PluginLinker.hpp
index e5b77bd027..5920142759 100644
--- a/include/RAJA/util/PluginLinker.hpp
+++ b/include/RAJA/util/PluginLinker.hpp
@@ -11,14 +11,18 @@
 #include "RAJA/util/RuntimePluginLoader.hpp"
 #include "RAJA/util/KokkosPluginLoader.hpp"
 
-namespace {
-  namespace anonymous_RAJA {
-    struct pluginLinker {
-      inline pluginLinker() {
-        (void)RAJA::util::linkRuntimePluginLoader();
-        (void)RAJA::util::linkKokkosPluginLoader();
-      }
-    } pluginLinker;
+namespace
+{
+namespace anonymous_RAJA
+{
+struct pluginLinker
+{
+  inline pluginLinker()
+  {
+    (void)RAJA::util::linkRuntimePluginLoader();
+    (void)RAJA::util::linkKokkosPluginLoader();
   }
-}
+} pluginLinker;
+}  // namespace anonymous_RAJA
+}  // namespace
 #endif
diff --git a/include/RAJA/util/PluginOptions.hpp b/include/RAJA/util/PluginOptions.hpp
index f0b6a35507..50ed3a1da9 100644
--- a/include/RAJA/util/PluginOptions.hpp
+++ b/include/RAJA/util/PluginOptions.hpp
@@ -10,22 +10,24 @@
 
 #include <string>
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 struct PluginOptions
 {
-    PluginOptions(const std::string& newstr) : str(newstr) {};
-    
-    std::string str;
+  PluginOptions(const std::string& newstr) : str(newstr) {};
+
+  std::string str;
 };
 
 inline PluginOptions make_options(const std::string& newstr)
 {
-    return PluginOptions{newstr};
+  return PluginOptions {newstr};
 }
 
-} // namespace util
-} // namespace RAJA
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/PluginStrategy.hpp b/include/RAJA/util/PluginStrategy.hpp
index 3935559bba..86f8fd7f6b 100644
--- a/include/RAJA/util/PluginStrategy.hpp
+++ b/include/RAJA/util/PluginStrategy.hpp
@@ -12,33 +12,35 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/Registry.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 class PluginStrategy
 {
-  public:
-    RAJASHAREDDLL_API PluginStrategy();
+public:
+  RAJASHAREDDLL_API PluginStrategy();
 
-    virtual ~PluginStrategy() = default;
+  virtual ~PluginStrategy() = default;
 
-    virtual RAJASHAREDDLL_API void init(const PluginOptions& p);
+  virtual RAJASHAREDDLL_API void init(const PluginOptions& p);
 
-    virtual RAJASHAREDDLL_API void preCapture(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void preCapture(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void postCapture(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void postCapture(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void preLaunch(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void preLaunch(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void postLaunch(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void postLaunch(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void finalize();
+  virtual RAJASHAREDDLL_API void finalize();
 };
 
 using PluginRegistry = Registry<PluginStrategy>;
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/util/Registry.hpp b/include/RAJA/util/Registry.hpp
index 579481a6ed..4bfb2ee7b8 100644
--- a/include/RAJA/util/Registry.hpp
+++ b/include/RAJA/util/Registry.hpp
@@ -10,126 +10,147 @@
 
 #include <memory>
 
-namespace RAJA {
-namespace util {
-
-  template <typename T>
-  class RegistryEntry {
-    std::string Name, Desc;
-    std::shared_ptr<T> object;
+namespace RAJA
+{
+namespace util
+{
+
+template <typename T>
+class RegistryEntry
+{
+  std::string Name, Desc;
+  std::shared_ptr<T> object;
+
+public:
+  RegistryEntry(const std::string& N,
+                const std::string& D,
+                std::shared_ptr<T> (*C)())
+      : Name(N), Desc(D), object(C())
+  {}
+
+  const std::string& getName() const { return Name; }
+  const std::string& getDesc() const { return Desc; }
+  T* get() const { return object.get(); }
+};
+
+/// A global registry used in conjunction with static constructors to make
+/// pluggable components (like targets or garbage collectors) "just work" when
+/// linked with an executable.
+template <typename T>
+class Registry
+{
+public:
+  using type  = T;
+  using entry = RegistryEntry<T>;
+
+  class node;
+  class iterator;
+
+private:
+  Registry() = delete;
+
+  friend class node;
+  static node *Head, *Tail;
+
+public:
+  /// Node in linked list of entries.
+  ///
+  class node
+  {
+    friend class iterator;
+    friend Registry<T>;
+
+    node* Next;
+    const entry& Val;
 
   public:
-    RegistryEntry(const std::string& N, const std::string& D,
-        std::shared_ptr<T> (*C)())
-        : Name(N), Desc(D), object(C()) {}
-
-    const std::string& getName() const { return Name; }
-    const std::string& getDesc() const { return Desc; }
-    T* get() const { return object.get(); }
+    node(const entry& V) : Next(nullptr), Val(V) {}
   };
 
-  /// A global registry used in conjunction with static constructors to make
-  /// pluggable components (like targets or garbage collectors) "just work" when
-  /// linked with an executable.
-  template <typename T>
-  class Registry {
+  /// Add a node to the Registry: this is the interface between the plugin and
+  /// the executable.
+  ///
+  /// This function is exported by the executable and called by the plugin to
+  /// add a node to the executable's registry. Therefore it's not defined here
+  /// to avoid it being instantiated in the plugin and is instead defined in
+  /// the executable (see RAJA_INSTANTIATE_REGISTRY below).
+  static RAJASHAREDDLL_API void add_node(node* N);
+
+  /// Iterators for registry entries.
+  ///
+  class iterator
+  {
+    const node* Cur;
+
   public:
-    using type = T;
-    using entry = RegistryEntry<T>;
+    explicit iterator(const node* N) : Cur(N) {}
+
+    bool operator==(const iterator& That) const { return Cur == That.Cur; }
+    bool operator!=(const iterator& That) const { return Cur != That.Cur; }
+    iterator& operator++()
+    {
+      Cur = Cur->Next;
+      return *this;
+    }
+    const entry& operator*() const { return Cur->Val; }
+    const entry* operator->() const { return &Cur->Val; }
+  };
 
-    class node;
-    class iterator;
+  // begin is not defined here in order to avoid usage of an undefined static
+  // data member, instead it's instantiated by RAJA_INSTANTIATE_REGISTRY.
+  static RAJASHAREDDLL_API iterator begin();
+  static iterator end() { return iterator(nullptr); }
 
-  private:
-    Registry() = delete;
+  /// A static registration template.
+  template <typename V>
+  class add
+  {
+    entry Entry;
+    node Node;
 
-    friend class node;
-    static node *Head, *Tail;
+    static std::shared_ptr<T> CtorFn() { return std::make_shared<V>(); }
 
   public:
-    /// Node in linked list of entries.
-    ///
-    class node {
-      friend class iterator;
-      friend Registry<T>;
-
-      node *Next;
-      const entry& Val;
-
-    public:
-      node(const entry &V) : Next(nullptr), Val(V) {}
-    };
-
-    /// Add a node to the Registry: this is the interface between the plugin and
-    /// the executable.
-    ///
-    /// This function is exported by the executable and called by the plugin to
-    /// add a node to the executable's registry. Therefore it's not defined here
-    /// to avoid it being instantiated in the plugin and is instead defined in
-    /// the executable (see RAJA_INSTANTIATE_REGISTRY below).
-    static RAJASHAREDDLL_API void add_node(node *N);
-
-    /// Iterators for registry entries.
-    ///
-    class iterator {
-      const node *Cur;
-
-    public:
-      explicit iterator(const node *N) : Cur(N) {}
-
-      bool operator==(const iterator &That) const { return Cur == That.Cur; }
-      bool operator!=(const iterator &That) const { return Cur != That.Cur; }
-      iterator &operator++() { Cur = Cur->Next; return *this; }
-      const entry &operator*() const { return Cur->Val; }
-      const entry *operator->() const { return &Cur->Val; }
-    };
-
-    // begin is not defined here in order to avoid usage of an undefined static
-    // data member, instead it's instantiated by RAJA_INSTANTIATE_REGISTRY.
-    static RAJASHAREDDLL_API iterator begin();
-    static iterator end()   { return iterator(nullptr); }
-
-    /// A static registration template.
-    template <typename V>
-    class add {
-      entry Entry;
-      node Node;
-
-      static std::shared_ptr<T> CtorFn() { return std::make_shared<V>(); }
-
-    public:
-      add(const std::string& Name, const std::string& Desc)
-          : Entry(Name, Desc, CtorFn), Node(Entry) {
-        add_node(&Node);
-      }
-    };
+    add(const std::string& Name, const std::string& Desc)
+        : Entry(Name, Desc, CtorFn), Node(Entry)
+    {
+      add_node(&Node);
+    }
   };
-
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
-
-#define RAJA_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \
-  namespace RAJA { \
-  namespace util { \
-  template<typename T> typename Registry<T>::node *Registry<T>::Head = nullptr;\
-  template<typename T> typename Registry<T>::node *Registry<T>::Tail = nullptr;\
-  template<typename T> \
-  void Registry<T>::add_node(typename Registry<T>::node *N) { \
-    if (Tail) \
-      Tail->Next = N; \
-    else \
-      Head = N; \
-    Tail = N; \
-  } \
-  template<typename T> typename Registry<T>::iterator Registry<T>::begin() { \
-    return iterator(Head); \
-  } \
-  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Head; \
-  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Tail; \
-  template \
-  void Registry<REGISTRY_CLASS::type>::add_node(REGISTRY_CLASS::node*); \
-  template REGISTRY_CLASS::iterator Registry<REGISTRY_CLASS::type>::begin(); \
-  } \
+};
+
+}  // namespace util
+}  // namespace RAJA
+
+#define RAJA_INSTANTIATE_REGISTRY(REGISTRY_CLASS)                              \
+  namespace RAJA                                                               \
+  {                                                                            \
+  namespace util                                                               \
+  {                                                                            \
+  template <typename T>                                                        \
+  typename Registry<T>::node* Registry<T>::Head = nullptr;                     \
+  template <typename T>                                                        \
+  typename Registry<T>::node* Registry<T>::Tail = nullptr;                     \
+  template <typename T>                                                        \
+  void Registry<T>::add_node(typename Registry<T>::node* N)                    \
+  {                                                                            \
+    if (Tail)                                                                  \
+      Tail->Next = N;                                                          \
+    else                                                                       \
+      Head = N;                                                                \
+    Tail = N;                                                                  \
+  }                                                                            \
+  template <typename T>                                                        \
+  typename Registry<T>::iterator Registry<T>::begin()                          \
+  {                                                                            \
+    return iterator(Head);                                                     \
+  }                                                                            \
+  template REGISTRY_CLASS::node* Registry<REGISTRY_CLASS::type>::Head;         \
+  template REGISTRY_CLASS::node* Registry<REGISTRY_CLASS::type>::Tail;         \
+  template void                                                                \
+  Registry<REGISTRY_CLASS::type>::add_node(REGISTRY_CLASS::node*);             \
+  template REGISTRY_CLASS::iterator Registry<REGISTRY_CLASS::type>::begin();   \
+  }                                                                            \
   }
 
 #endif
diff --git a/include/RAJA/util/RepeatView.hpp b/include/RAJA/util/RepeatView.hpp
index 618913f794..0f8110288b 100644
--- a/include/RAJA/util/RepeatView.hpp
+++ b/include/RAJA/util/RepeatView.hpp
@@ -50,75 +50,132 @@ namespace RAJA
  *   unbounded extents
  *
  */
-template < typename T >
+template <typename T>
 struct RepeatView
 {
   struct iterator
   {
     using difference_type = std::ptrdiff_t;
-    using value_type = T;
-    using reference = value_type const&;
+    using value_type      = T;
+    using reference       = value_type const&;
 
     iterator() = default;
 
     constexpr iterator(const T* base, size_t index)
-      : m_value(base), m_index(index)
-    { }
+        : m_value(base), m_index(index)
+    {}
 
     constexpr reference operator*() const noexcept { return *m_value; }
-    constexpr reference operator[](difference_type index) const noexcept { return *(*this + index); }
-
-    constexpr iterator& operator++() { ++m_index; return *this; }
-    constexpr iterator operator++(int) { auto tmp = *this; ++(*this); return tmp; }
-
-    constexpr iterator& operator--() { --m_index; return *this; }
-    constexpr iterator operator--(int) { auto tmp = *this; --(*this); return tmp; }
-
-    constexpr iterator& operator+=(difference_type rhs) { m_index += rhs; return *this; }
-    constexpr iterator& operator-=(difference_type rhs) { m_index -= rhs; return *this; }
+    constexpr reference operator[](difference_type index) const noexcept
+    {
+      return *(*this + index);
+    }
+
+    constexpr iterator& operator++()
+    {
+      ++m_index;
+      return *this;
+    }
+    constexpr iterator operator++(int)
+    {
+      auto tmp = *this;
+      ++(*this);
+      return tmp;
+    }
+
+    constexpr iterator& operator--()
+    {
+      --m_index;
+      return *this;
+    }
+    constexpr iterator operator--(int)
+    {
+      auto tmp = *this;
+      --(*this);
+      return tmp;
+    }
+
+    constexpr iterator& operator+=(difference_type rhs)
+    {
+      m_index += rhs;
+      return *this;
+    }
+    constexpr iterator& operator-=(difference_type rhs)
+    {
+      m_index -= rhs;
+      return *this;
+    }
 
     friend constexpr iterator operator+(iterator lhs, difference_type rhs)
-    { lhs += rhs; return lhs; }
+    {
+      lhs += rhs;
+      return lhs;
+    }
     friend constexpr iterator operator+(difference_type lhs, iterator rhs)
-    { rhs += lhs; return rhs; }
+    {
+      rhs += lhs;
+      return rhs;
+    }
 
     friend constexpr iterator operator-(iterator lhs, difference_type rhs)
-    { lhs -= rhs; return lhs; }
-    friend constexpr difference_type operator-(iterator const& lhs, iterator const& rhs)
-    { return static_cast<difference_type>(lhs.m_index) - static_cast<difference_type>(rhs.m_index); }
+    {
+      lhs -= rhs;
+      return lhs;
+    }
+    friend constexpr difference_type operator-(iterator const& lhs,
+                                               iterator const& rhs)
+    {
+      return static_cast<difference_type>(lhs.m_index) -
+             static_cast<difference_type>(rhs.m_index);
+    }
 
     friend constexpr bool operator==(iterator const& lhs, iterator const& rhs)
-    { return lhs.m_index == rhs.m_index; }
+    {
+      return lhs.m_index == rhs.m_index;
+    }
     friend constexpr bool operator!=(iterator const& lhs, iterator const& rhs)
-    { return !(lhs == rhs); }
+    {
+      return !(lhs == rhs);
+    }
 
     friend constexpr bool operator<(iterator const& lhs, iterator const& rhs)
-    { return lhs.m_index < rhs.m_index; }
+    {
+      return lhs.m_index < rhs.m_index;
+    }
     friend constexpr bool operator<=(iterator const& lhs, iterator const& rhs)
-    { return !(rhs < lhs); }
+    {
+      return !(rhs < lhs);
+    }
     friend constexpr bool operator>(iterator const& lhs, iterator const& rhs)
-    { return rhs < lhs; }
+    {
+      return rhs < lhs;
+    }
     friend constexpr bool operator>=(iterator const& lhs, iterator const& rhs)
-    { return !(lhs < rhs); }
+    {
+      return !(lhs < rhs);
+    }
 
   private:
     const T* m_value = nullptr;
-    size_t m_index = 0;
+    size_t m_index   = 0;
   };
 
   RepeatView() = delete;
 
   constexpr RepeatView(T const& value, size_t bound)
-    : m_bound(bound), m_value(value)
-  { }
+      : m_bound(bound), m_value(value)
+  {}
 
   constexpr RepeatView(T&& value, size_t bound)
-    : m_bound(bound), m_value(std::move(value))
-  { }
+      : m_bound(bound), m_value(std::move(value))
+  {}
 
   constexpr T const& front() const { return m_value; }
   constexpr T const& back() const { return m_value; }
-  constexpr T const& operator[](size_t RAJA_UNUSED_ARG(index)) const { return m_value; }
+  constexpr T const& operator[](size_t RAJA_UNUSED_ARG(index)) const
+  {
+    return m_value;
+  }
 
   constexpr iterator begin() const { return iterator(&m_value, 0); }
   constexpr iterator cbegin() const { return iterator(&m_value, 0); }
diff --git a/include/RAJA/util/RuntimePluginLoader.hpp b/include/RAJA/util/RuntimePluginLoader.hpp
index 3e7fbb165f..289e067b0a 100644
--- a/include/RAJA/util/RuntimePluginLoader.hpp
+++ b/include/RAJA/util/RuntimePluginLoader.hpp
@@ -14,39 +14,40 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/PluginStrategy.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
-  class RuntimePluginLoader : public RAJA::util::PluginStrategy
-  {
-    using Parent = RAJA::util::PluginStrategy;
+class RuntimePluginLoader : public RAJA::util::PluginStrategy
+{
+  using Parent = RAJA::util::PluginStrategy;
 
-  public:
-    RuntimePluginLoader();
+public:
+  RuntimePluginLoader();
 
-    void init(const RAJA::util::PluginOptions& p) override;
+  void init(const RAJA::util::PluginOptions& p) override;
 
-    void preCapture(const RAJA::util::PluginContext& p) override;
+  void preCapture(const RAJA::util::PluginContext& p) override;
 
-    void postCapture(const RAJA::util::PluginContext& p) override;
+  void postCapture(const RAJA::util::PluginContext& p) override;
 
-    void preLaunch(const RAJA::util::PluginContext& p) override;
+  void preLaunch(const RAJA::util::PluginContext& p) override;
 
-    void postLaunch(const RAJA::util::PluginContext& p) override;
+  void postLaunch(const RAJA::util::PluginContext& p) override;
 
-    void finalize() override;
+  void finalize() override;
 
-  private:
+private:
+  void initPlugin(const std::string& path);
 
-    void initPlugin(const std::string &path);
-    
-    void initDirectory(const std::string &path);
+  void initDirectory(const std::string& path);
 
-    std::vector<std::unique_ptr<Parent>> plugins;
+  std::vector<std::unique_ptr<Parent>> plugins;
 
-  };  // end RuntimePluginLoader class
+};  // end RuntimePluginLoader class
 
-  void linkRuntimePluginLoader();
+void linkRuntimePluginLoader();
 
 }  // end namespace util
 }  // end namespace RAJA
diff --git a/include/RAJA/util/SoAArray.hpp b/include/RAJA/util/SoAArray.hpp
index 6828bc3b1a..c4d63ac19f 100644
--- a/include/RAJA/util/SoAArray.hpp
+++ b/include/RAJA/util/SoAArray.hpp
@@ -52,10 +52,10 @@ class SoAArray
  * @brief Specialization for RAJA::reduce::detail::ValueLoc.
  */
 template <typename T, typename IndexType, bool doing_min, size_t size>
-class SoAArray< ::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, size>
+class SoAArray<::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, size>
 {
-  using value_type = ::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
-  using first_type = T;
+  using value_type  = ::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
+  using first_type  = T;
   using second_type = IndexType;
 
 public:
@@ -65,7 +65,7 @@ class SoAArray< ::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, size>
   }
   RAJA_HOST_DEVICE void set(size_t i, value_type val)
   {
-    mem[i] = val;
+    mem[i]     = val;
     mem_idx[i] = val.getLoc();
   }
 
diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp
index 6adea65b80..2aea24a82c 100644
--- a/include/RAJA/util/SoAPtr.hpp
+++ b/include/RAJA/util/SoAPtr.hpp
@@ -45,35 +45,35 @@ namespace detail
 template <typename T,
           typename mempool = RAJA::basic_mempool::MemPool<
               RAJA::basic_mempool::generic_allocator>,
-          typename accessor = DefaultAccessor >
+          typename accessor = DefaultAccessor>
 class SoAPtr
 {
-  template < typename, typename, typename >
-  friend class SoAPtr; // friend other instantiations of this class
+  template <typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = T;
 
-  template < typename rhs_accessor >
+  template <typename rhs_accessor>
   using rebind_accessor = SoAPtr<T, mempool, rhs_accessor>;
 
-  SoAPtr() = default;
-  SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr()                         = default;
+  SoAPtr(SoAPtr const&)            = default;
+  SoAPtr(SoAPtr&&)                 = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&)      = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<value_type>(size))
-  {
-  }
+  {}
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-  { }
+  template <
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem)
+  {}
 
   SoAPtr& allocate(size_t size)
   {
@@ -90,8 +90,14 @@ class SoAPtr
 
   RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; }
 
-  RAJA_HOST_DEVICE value_type get(size_t i) const { return accessor::get(mem, i); }
-  RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val); }
+  RAJA_HOST_DEVICE value_type get(size_t i) const
+  {
+    return accessor::get(mem, i);
+  }
+  RAJA_HOST_DEVICE void set(size_t i, value_type val)
+  {
+    accessor::set(mem, i, val);
+  }
 
 private:
   value_type* mem = nullptr;
@@ -100,44 +106,49 @@ class SoAPtr
 /*!
  * @brief Specialization for RAJA::reduce::detail::ValueLoc.
  */
-template <typename T, typename IndexType, bool doing_min, typename mempool, typename accessor>
-class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool, accessor>
+template <typename T,
+          typename IndexType,
+          bool doing_min,
+          typename mempool,
+          typename accessor>
+class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>,
+             mempool,
+             accessor>
 {
-  using first_type = T;
+  using first_type  = T;
   using second_type = IndexType;
 
-  template < typename, typename, typename >
-  friend class SoAPtr; // fiend other instantiations of this class
+  template <typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
 
-  template < typename rhs_accessor >
+  template <typename rhs_accessor>
   using rebind_accessor = SoAPtr<value_type, mempool, rhs_accessor>;
 
-  SoAPtr() = default;
-  SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr()                         = default;
+  SoAPtr(SoAPtr const&)            = default;
+  SoAPtr(SoAPtr&&)                 = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&)      = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<first_type>(size)),
         mem_idx(mempool::getInstance().template malloc<second_type>(size))
-  {
-  }
+  {}
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-    , mem_idx(rhs.mem_idx)
-  { }
+  template <
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem), mem_idx(rhs.mem_idx)
+  {}
 
   SoAPtr& allocate(size_t size)
   {
-    mem = mempool::getInstance().template malloc<first_type>(size);
+    mem     = mempool::getInstance().template malloc<first_type>(size);
     mem_idx = mempool::getInstance().template malloc<second_type>(size);
     return *this;
   }
@@ -164,7 +175,7 @@ class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool, a
   }
 
 private:
-  first_type* mem = nullptr;
+  first_type* mem      = nullptr;
   second_type* mem_idx = nullptr;
 };
 
@@ -174,41 +185,40 @@ class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool, a
 template <typename T, typename IndexType, typename mempool, typename accessor>
 class SoAPtr<RAJA::expt::ValLoc<T, IndexType>, mempool, accessor>
 {
-  using first_type = T;
+  using first_type  = T;
   using second_type = IndexType;
 
-  template < typename, typename, typename >
-  friend class SoAPtr; // friend other instantiations of this class
+  template <typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = RAJA::expt::ValLoc<T, IndexType>;
 
-  template < typename rhs_accessor >
+  template <typename rhs_accessor>
   using rebind_accessor = SoAPtr<value_type, mempool, rhs_accessor>;
 
-  SoAPtr() = default;
-  SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr()                         = default;
+  SoAPtr(SoAPtr const&)            = default;
+  SoAPtr(SoAPtr&&)                 = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&)      = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<first_type>(size)),
         mem_idx(mempool::getInstance().template malloc<second_type>(size))
-  {
-  }
+  {}
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-    , mem_idx(rhs.mem_idx)
-  { }
+  template <
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem), mem_idx(rhs.mem_idx)
+  {}
 
   SoAPtr& allocate(size_t size)
   {
-    mem = mempool::getInstance().template malloc<first_type>(size);
+    mem     = mempool::getInstance().template malloc<first_type>(size);
     mem_idx = mempool::getInstance().template malloc<second_type>(size);
     return *this;
   }
@@ -235,7 +245,7 @@ class SoAPtr<RAJA::expt::ValLoc<T, IndexType>, mempool, accessor>
   }
 
 private:
-  first_type* mem = nullptr;
+  first_type* mem      = nullptr;
   second_type* mem_idx = nullptr;
 };
 
diff --git a/include/RAJA/util/Span.hpp b/include/RAJA/util/Span.hpp
index 2da2e0164c..61c3addb26 100644
--- a/include/RAJA/util/Span.hpp
+++ b/include/RAJA/util/Span.hpp
@@ -56,15 +56,16 @@ namespace RAJA
  *
  */
 template <typename IterType, typename IndexType>
-struct Span {
-  using element_type = typename std::iterator_traits<IterType>::value_type;
-  using value_type = camp::decay<element_type>;
-  using size_type = IndexType;
+struct Span
+{
+  using element_type    = typename std::iterator_traits<IterType>::value_type;
+  using value_type      = camp::decay<element_type>;
+  using size_type       = IndexType;
   using difference_type = std::ptrdiff_t;
-  using reference = element_type&;
+  using reference       = element_type&;
   using const_reference = const element_type&;
-  using iterator = IterType;
-  using const_iterator = IterType;
+  using iterator        = IterType;
+  using const_iterator  = IterType;
 
   static_assert(type_traits::is_integral<IndexType>::value,
                 "IndexType must model Integral");
@@ -72,14 +73,12 @@ struct Span {
                 "IterType must model RandomAccessIterator");
 
   RAJA_HOST_DEVICE Span(iterator begin, iterator end)
-      : m_begin{begin}, m_end{end}
-  {
-  }
+      : m_begin {begin}, m_end {end}
+  {}
 
   RAJA_HOST_DEVICE Span(iterator begin, size_type size)
-      : m_begin{begin}, m_end{begin + size}
-  {
-  }
+      : m_begin {begin}, m_end {begin + size}
+  {}
 
   RAJA_HOST_DEVICE RAJA_INLINE iterator begin() { return m_begin; }
   RAJA_HOST_DEVICE RAJA_INLINE iterator end() { return m_end; }
@@ -88,16 +87,34 @@ struct Span {
   RAJA_HOST_DEVICE RAJA_INLINE const_iterator cbegin() const { return m_begin; }
   RAJA_HOST_DEVICE RAJA_INLINE const_iterator cend() const { return m_end; }
 
-  RAJA_HOST_DEVICE RAJA_INLINE friend iterator begin(Span& s) { return s.begin(); }
+  RAJA_HOST_DEVICE RAJA_INLINE friend iterator begin(Span& s)
+  {
+    return s.begin();
+  }
   RAJA_HOST_DEVICE RAJA_INLINE friend iterator end(Span& s) { return s.end(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator begin(const Span& s) { return s.begin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator end(const Span& s) { return s.end(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cbegin(const Span& s) { return s.cbegin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cend(const Span& s) { return s.cend(); }
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator begin(const Span& s)
+  {
+    return s.begin();
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator end(const Span& s)
+  {
+    return s.end();
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cbegin(const Span& s)
+  {
+    return s.cbegin();
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cend(const Span& s)
+  {
+    return s.cend();
+  }
 
   RAJA_HOST_DEVICE RAJA_INLINE reference front() const { return *begin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE reference back() const { return *(end()-1); }
-  RAJA_HOST_DEVICE RAJA_INLINE reference operator[](size_type i) const { return data()[i]; }
+  RAJA_HOST_DEVICE RAJA_INLINE reference back() const { return *(end() - 1); }
+  RAJA_HOST_DEVICE RAJA_INLINE reference operator[](size_type i) const
+  {
+    return data()[i];
+  }
   RAJA_HOST_DEVICE RAJA_INLINE iterator data() const { return m_begin; }
 
   RAJA_HOST_DEVICE RAJA_INLINE size_type size() const
@@ -127,7 +144,7 @@ struct Span {
                                           size_type length) const
   {
     auto start = m_begin + begin;
-    auto end = start + length > m_end ? m_end : start + length;
+    auto end   = start + length > m_end ? m_end : start + length;
     return Span(start, end);
   }
 
@@ -157,21 +174,21 @@ struct Span {
  *
  */
 template <typename IterType, typename IndexType>
-RAJA_HOST_DEVICE RAJA_INLINE Span<IterType, IndexType> make_span(
-    IterType begin,
-    IndexType size)
+RAJA_HOST_DEVICE RAJA_INLINE Span<IterType, IndexType> make_span(IterType begin,
+                                                                 IndexType size)
 {
   return Span<IterType, IndexType>(begin, size);
 }
 
 template <typename Iter>
-RAJA_INLINE auto make_span(Iter &iterable)
+RAJA_INLINE auto make_span(Iter& iterable)
 {
   using std::begin;
-  using std::end;
   using std::distance;
-  return Span<typename Iter::iterator, decltype(distance(begin(iterable), end(iterable)))>
-    (begin(iterable), end(iterable));
+  using std::end;
+  return Span<typename Iter::iterator,
+              decltype(distance(begin(iterable), end(iterable)))>(
+      begin(iterable), end(iterable));
 }
 
 }  // end namespace RAJA
diff --git a/include/RAJA/util/StaticLayout.hpp b/include/RAJA/util/StaticLayout.hpp
index 8d27980f83..df70092459 100644
--- a/include/RAJA/util/StaticLayout.hpp
+++ b/include/RAJA/util/StaticLayout.hpp
@@ -32,7 +32,6 @@
 #include "RAJA/util/Permutations.hpp"
 
 
-
 namespace RAJA
 {
 
@@ -40,7 +39,11 @@ namespace detail
 {
 
 
-template <typename IdxLin, typename Range, typename Sizes, typename Strides, typename DimTypeList=void>
+template <typename IdxLin,
+          typename Range,
+          typename Sizes,
+          typename Strides,
+          typename DimTypeList = void>
 struct StaticLayoutBase_impl;
 
 
@@ -52,15 +55,16 @@ struct StaticLayoutBase_impl<IdxLin,
                              camp::int_seq<IdxLin, RangeInts...>,
                              camp::int_seq<IdxLin, Sizes...>,
                              camp::int_seq<IdxLin, Strides...>,
-                             void> {
+                             void>
+{
 
   using IndexLinear = IdxLin;
-  using sizes = camp::int_seq<IdxLin, Sizes...>;
-  using strides = camp::int_seq<IdxLin, Strides...>;
+  using sizes       = camp::int_seq<IdxLin, Sizes...>;
+  using strides     = camp::int_seq<IdxLin, Strides...>;
 
-  static constexpr camp::idx_t stride_one_dim =
-      RAJA::max<camp::idx_t>(
-          (camp::seq_at<RangeInts, strides>::value == 1 ? camp::idx_t(RangeInts) : -1)...);
+  static constexpr camp::idx_t stride_one_dim = RAJA::max<camp::idx_t>(
+      (camp::seq_at<RangeInts, strides>::value == 1 ? camp::idx_t(RangeInts)
+                                                    : -1)...);
 
   static constexpr size_t n_dims = sizeof...(Sizes);
 
@@ -72,9 +76,7 @@ struct StaticLayoutBase_impl<IdxLin,
   RAJA_INLINE static void print()
   {
     camp::sink(printf("StaticLayout: arg%d: size=%d, stride=%d\n",
-                               (int)RangeInts,
-                               (int)Sizes,
-                               (int)Strides)...);
+                      (int)RangeInts, (int)Sizes, (int)Strides)...);
   }
 
 
@@ -86,8 +88,8 @@ struct StaticLayoutBase_impl<IdxLin,
    * @return Linear space index.
    */
   template <typename... Indices>
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
-      Indices... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin
+  operator()(Indices... indices) const
   {
     // dot product of strides and indices
     return RAJA::sum<IdxLin>((IdxLin(indices * Strides))...);
@@ -95,7 +97,8 @@ struct StaticLayoutBase_impl<IdxLin,
 
 
   template <typename... Indices>
-  static RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin s_oper(Indices... indices)
+  static RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin
+  s_oper(Indices... indices)
   {
     // dot product of strides and indices
     return RAJA::sum<IdxLin>((IdxLin(indices * Strides))...);
@@ -108,8 +111,7 @@ struct StaticLayoutBase_impl<IdxLin,
       RAJA::product<IdxLin>((Sizes == IdxLin(0) ? IdxLin(1) : Sizes)...);
 
   // Multiply together all of the sizes
-  static constexpr IdxLin s_size_noproj =
-      RAJA::product<IdxLin>(Sizes...);
+  static constexpr IdxLin s_size_noproj = RAJA::product<IdxLin>(Sizes...);
 
   /*!
    * Computes a size of the layout's space with projections as size 1.
@@ -137,37 +139,31 @@ struct StaticLayoutBase_impl<IdxLin,
   }
 
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return camp::seq_at<DIM, strides>::value;
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return camp::seq_at<DIM, sizes>::value;
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
-
 };
 
 template <typename IdxLin, IdxLin N, IdxLin Idx, IdxLin... Sizes>
-struct StrideCalculatorIdx {
+struct StrideCalculatorIdx
+{
   static_assert(N == sizeof...(Sizes), "");
 
-  using sizes_seq = camp::int_seq<IdxLin, Sizes...>;
+  using sizes_seq              = camp::int_seq<IdxLin, Sizes...>;
   static constexpr IdxLin size = camp::seq_at<Idx, sizes_seq>::value;
   static constexpr IdxLin size_last =
       StrideCalculatorIdx<IdxLin, N, Idx + 1, Sizes...>::size;
@@ -178,38 +174,49 @@ struct StrideCalculatorIdx {
 };
 
 template <typename IdxLin, IdxLin N, IdxLin... Sizes>
-struct StrideCalculatorIdx<IdxLin, N, N, Sizes...> {
+struct StrideCalculatorIdx<IdxLin, N, N, Sizes...>
+{
   static_assert(N == sizeof...(Sizes), "");
 
-  static constexpr IdxLin size = 1;
-  static constexpr IdxLin value = 1;
+  static constexpr IdxLin size   = 1;
+  static constexpr IdxLin value  = 1;
   static constexpr IdxLin stride = size > 0 ? value : 0;
 };
 
 template <typename IdxLin, typename Range, typename Perm, typename Sizes>
 struct StrideCalculator;
 
-template <typename IdxLin, IdxLin ... Range, camp::idx_t... Perm, IdxLin... Sizes>
+template <typename IdxLin,
+          IdxLin... Range,
+          camp::idx_t... Perm,
+          IdxLin... Sizes>
 struct StrideCalculator<IdxLin,
                         camp::int_seq<IdxLin, Range...>,
                         camp::idx_seq<Perm...>,
-                        camp::int_seq<IdxLin, Sizes...>> {
+                        camp::int_seq<IdxLin, Sizes...>>
+{
   static_assert(sizeof...(Sizes) == sizeof...(Perm), "");
 
-  using sizes = camp::int_seq<IdxLin, Sizes...>;
+  using sizes               = camp::int_seq<IdxLin, Sizes...>;
   static constexpr IdxLin N = sizeof...(Sizes);
-  using range = camp::int_seq<IdxLin, Range...>;
-  using perm = camp::idx_seq<Perm...>;
-  using inv_perm = invert_permutation<perm>;
-
-  using strides_unperm =
-      camp::int_seq<IdxLin, StrideCalculatorIdx<IdxLin, N, Range, camp::seq_at<Perm, sizes>::value...>::stride...>;
-
-  using strides = camp::int_seq<IdxLin, camp::seq_at<camp::seq_at<Range, inv_perm>::value, strides_unperm>::value...>;
+  using range               = camp::int_seq<IdxLin, Range...>;
+  using perm                = camp::idx_seq<Perm...>;
+  using inv_perm            = invert_permutation<perm>;
+
+  using strides_unperm = camp::int_seq<
+      IdxLin,
+      StrideCalculatorIdx<IdxLin,
+                          N,
+                          Range,
+                          camp::seq_at<Perm, sizes>::value...>::stride...>;
+
+  using strides =
+      camp::int_seq<IdxLin,
+                    camp::seq_at<camp::seq_at<Range, inv_perm>::value,
+                                 strides_unperm>::value...>;
 };
 
 
-
 template <typename IdxLin,
           IdxLin... RangeInts,
           IdxLin... Sizes,
@@ -219,19 +226,19 @@ struct StaticLayoutBase_impl<IdxLin,
                              camp::int_seq<IdxLin, RangeInts...>,
                              camp::int_seq<IdxLin, Sizes...>,
                              camp::int_seq<IdxLin, Strides...>,
-                             camp::list<DimTypes...>> {
+                             camp::list<DimTypes...>>
+{
 
 
   using IndexLinear = IdxLin;
   using ranges      = camp::int_seq<IdxLin, RangeInts...>;
   using sizes       = camp::int_seq<IdxLin, Sizes...>;
-  using strides     = camp::int_seq<IdxLin, Strides...>;  
+  using strides     = camp::int_seq<IdxLin, Strides...>;
 
-  using InnerLayout = StaticLayoutBase_impl<IdxLin,ranges,sizes,strides,void>;
+  using InnerLayout =
+      StaticLayoutBase_impl<IdxLin, ranges, sizes, strides, void>;
 
-  static
-  constexpr
-  camp::idx_t stride_one_dim = InnerLayout::stride_one_dim;
+  static constexpr camp::idx_t stride_one_dim = InnerLayout::stride_one_dim;
 
   static constexpr IndexLinear n_dims = sizeof...(DimTypes);
   /*!
@@ -241,14 +248,14 @@ struct StaticLayoutBase_impl<IdxLin,
    * @param indices  Indices in the n-dimensional space of this layout
    * @return Linear space index.
    */
-  static RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear s_oper(
-      DimTypes... indices)
+  static RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear
+  s_oper(DimTypes... indices)
   {
     return InnerLayout::s_oper(stripIndexType(indices)...);
   }
 
 
-  static constexpr IndexLinear s_size = InnerLayout::s_size;
+  static constexpr IndexLinear s_size        = InnerLayout::s_size;
   static constexpr IndexLinear s_size_noproj = InnerLayout::s_size_noproj;
 
   RAJA_INLINE RAJA_HOST_DEVICE constexpr static IndexLinear size()
@@ -261,49 +268,43 @@ struct StaticLayoutBase_impl<IdxLin,
     return s_size_noproj;
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
-    return InnerLayout{}.get_dim_stride();
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
+    return InnerLayout {}.template get_dim_stride<DIM>();  // DIM not deducible
+  }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return camp::seq_at<DIM, sizes>::value;
   }
 
-  template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  template <camp::idx_t DIM>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
 
 
   RAJA_INLINE
   static void print() { InnerLayout::print(); }
-
 };
 
 
-
-
-
-template <typename Perm, typename IdxLin, typename Sizes, typename Indexes, typename TypeList>
+template <typename Perm,
+          typename IdxLin,
+          typename Sizes,
+          typename Indexes,
+          typename TypeList>
 struct StaticLayoutMaker
 {
-  using strides = typename detail::StrideCalculator<IdxLin, Indexes, Perm, Sizes>::strides;
-  using type = StaticLayoutBase_impl<IdxLin, Indexes, Sizes, strides,TypeList>;
+  using strides =
+      typename detail::StrideCalculator<IdxLin, Indexes, Perm, Sizes>::strides;
+  using type = StaticLayoutBase_impl<IdxLin, Indexes, Sizes, strides, TypeList>;
 };
 
 
-
 }  // namespace detail
 
 
@@ -313,20 +314,21 @@ using StaticLayoutT = typename detail::StaticLayoutMaker<
     IdxLin,
     camp::int_seq<IdxLin, Sizes...>,
     camp::make_int_seq_t<IdxLin, sizeof...(Sizes)>,
-    void
-    >::type;
+    void>::type;
 
 template <typename Perm, camp::idx_t... Sizes>
 using StaticLayout = StaticLayoutT<Perm, camp::idx_t, Sizes...>;
 
-template <typename Perm, typename IdxLin, typename TypeList, camp::idx_t... Sizes>
+template <typename Perm,
+          typename IdxLin,
+          typename TypeList,
+          camp::idx_t... Sizes>
 using TypedStaticLayout = typename detail::StaticLayoutMaker<
     Perm,
     IdxLin,
     camp::int_seq<IdxLin, Sizes...>,
     camp::make_int_seq_t<IdxLin, sizeof...(Sizes)>,
-    TypeList
-    >::type;
+    TypeList>::type;
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/Timer.hpp b/include/RAJA/util/Timer.hpp
index 8c23a2c74d..6c96a12e23 100644
--- a/include/RAJA/util/Timer.hpp
+++ b/include/RAJA/util/Timer.hpp
@@ -51,7 +51,7 @@ class BGQTimer
   using ElapsedType = double;
 
 private:
-  using TimeType = timeval;
+  using TimeType     = timeval;
   using DurationType = std::chrono::duration<ElapsedType>;
 
 public:
@@ -104,14 +104,13 @@ class ChronoTimer
   using ElapsedType = double;
 
 private:
-  using ClockType = std::chrono::steady_clock;
-  using TimeType = ClockType::time_point;
+  using ClockType    = std::chrono::steady_clock;
+  using TimeType     = ClockType::time_point;
   using DurationType = std::chrono::duration<ElapsedType>;
 
 public:
   ChronoTimer() : tstart(ClockType::now()), tstop(ClockType::now()), telapsed(0)
-  {
-  }
+  {}
 
   void start() { tstart = ClockType::now(); }
 
@@ -174,7 +173,7 @@ class GettimeTimer
 
   void reset()
   {
-    stime_elapsed = 0;
+    stime_elapsed  = 0;
     nstime_elapsed = 0;
   }
 
diff --git a/include/RAJA/util/TypeConvert.hpp b/include/RAJA/util/TypeConvert.hpp
index 5cdc019259..9b34eb5e71 100644
--- a/include/RAJA/util/TypeConvert.hpp
+++ b/include/RAJA/util/TypeConvert.hpp
@@ -39,7 +39,7 @@ namespace util
  * Reinterpret any datatype as another datatype of the same size
  */
 template <typename A, typename B>
-RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A const &a)
+RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A const& a)
 {
   static_assert(sizeof(A) == sizeof(B), "A and B must be the same size");
 
diff --git a/include/RAJA/util/TypedViewBase.hpp b/include/RAJA/util/TypedViewBase.hpp
index 0d5bed35d6..9f7563d729 100644
--- a/include/RAJA/util/TypedViewBase.hpp
+++ b/include/RAJA/util/TypedViewBase.hpp
@@ -38,705 +38,786 @@ namespace RAJA
 namespace internal
 {
 
-  template<camp::idx_t, typename T>
-  struct IndexToType{
-      using type = T;
-  };
+template <camp::idx_t, typename T>
+struct IndexToType
+{
+  using type = T;
+};
 
-  template<typename IdxSeq, typename T>
-  struct SequenceToType;
+template <typename IdxSeq, typename T>
+struct SequenceToType;
 
-  template<camp::idx_t ... Perm, typename T>
-  struct SequenceToType<camp::idx_seq<Perm...>, T>{
-      using type =  camp::list<typename IndexToType<Perm, T>::type...>;
-  };
+template <camp::idx_t... Perm, typename T>
+struct SequenceToType<camp::idx_seq<Perm...>, T>
+{
+  using type = camp::list<typename IndexToType<Perm, T>::type...>;
+};
 
-  template<typename Perm>
-  using getDefaultIndexTypes = typename SequenceToType<Perm, RAJA::Index_type>::type;
+template <typename Perm>
+using getDefaultIndexTypes =
+    typename SequenceToType<Perm, RAJA::Index_type>::type;
 
 
+// Helpers to convert
+// layouts -> OffsetLayouts
+// TypedLayouts -> TypedOffsetLayouts
+template <typename layout>
+struct add_offset
+{
+  using type = RAJA::OffsetLayout<layout::n_dims>;
+};
 
+template <typename IdxLin, typename... DimTypes>
+struct add_offset<RAJA::TypedLayout<IdxLin, camp::tuple<DimTypes...>>>
+{
+  using type = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
+};
 
-  //Helpers to convert
-  //layouts -> OffsetLayouts
-  //Typedlayouts -> TypedOffsetLayouts
-  template<typename layout>
-  struct add_offset
-  {
-    using type = RAJA::OffsetLayout<layout::n_dims>;
-  };
 
-  template<typename IdxLin, typename...DimTypes>
-  struct add_offset<RAJA::TypedLayout<IdxLin,camp::tuple<DimTypes...>>>
-  {
-    using type = RAJA::TypedOffsetLayout<IdxLin,camp::tuple<DimTypes...>>;
-  };
+#if defined(RAJA_ENABLE_VECTORIZATION)
+namespace detail
+{
+/*
+ * Returns the argument number which contains the tensor index for DIM
+ *
+ * Returns -1 if no argument is a tensor index for dimension DIM
+ */
 
+template <camp::idx_t DIM, typename ARGS, typename IDX_SEQ>
+struct GetTensorArgIdxExpanded;
 
+template <camp::idx_t DIM, typename... ARGS, camp::idx_t... IDX>
+struct GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::idx_seq<IDX...>>
+{
 
+  static constexpr camp::idx_t value = RAJA::max<camp::idx_t>(
+      (internal::expt::isTensorIndex<ARGS>() &&
+               internal::expt::getTensorDim<ARGS>() == DIM
+           ? IDX
+           : -1)...);
+};
 
-#if defined(RAJA_ENABLE_VECTORIZATION)
-  namespace detail
-  {
-    /*
-     * Returns the argument number which contains a VectorIndex
-     *
-     * returns -1 if none of the arguments are VectorIndexs
-     */
 
-    template<camp::idx_t DIM, typename ARGS, typename IDX_SEQ>
-    struct GetTensorArgIdxExpanded;
+}  // namespace detail
+#endif
 
-    template<camp::idx_t DIM, typename ... ARGS, camp::idx_t ... IDX>
-    struct GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::idx_seq<IDX...>> {
 
-        static constexpr camp::idx_t value =
-            RAJA::max<camp::idx_t>(
-                (internal::expt::isTensorIndex<ARGS>()&&internal::expt::getTensorDim<ARGS>()==DIM ? IDX : -1) ...);
-    };
+/*
+ * Returns the number of arguments which are tensor indices
+ */
+template <typename... ARGS>
+struct count_num_tensor_args
+{
+  static constexpr camp::idx_t value =
+#if defined(RAJA_ENABLE_VECTORIZATION)
+      RAJA::sum<camp::idx_t>(
+          (internal::expt::isTensorIndex<ARGS>() ? 1 : 0)...);
+#else
+      0;  // There should be 0 Tensor indices if not vectorizing.
+#endif
+};
 
+#if defined(RAJA_ENABLE_VECTORIZATION)
+/*
+ * Returns which argument has a vector index
+ */
+template <camp::idx_t DIM, typename... ARGS>
+struct GetTensorArgIdx
+{
+  static constexpr camp::idx_t value = detail::GetTensorArgIdxExpanded<
+      DIM,
+      camp::list<ARGS...>,
+      camp::make_idx_seq_t<sizeof...(ARGS)>>::value;
+};
 
-  } // namespace detail
+template <camp::idx_t DIM, typename... ARGS>
+struct GetTensorArgIdx<DIM, camp::list<ARGS...>>
+{
+  static constexpr camp::idx_t value = detail::GetTensorArgIdxExpanded<
+      DIM,
+      camp::list<ARGS...>,
+      camp::make_idx_seq_t<sizeof...(ARGS)>>::value;
+};
+
+/*
+ * Returns the beginning index in a vector argument
+ */
+template <camp::idx_t DIM, typename LAYOUT, typename... ARGS>
+RAJA_INLINE RAJA_HOST_DEVICE static constexpr camp::idx_t
+get_tensor_args_begin(LAYOUT const& layout, ARGS... args)
+{
+  return RAJA::max<camp::idx_t>(
+      internal::expt::getTensorDim<ARGS>() == DIM
+          ? internal::expt::getTensorBegin<ARGS>(
+                args, layout.template get_dim_begin<
+                          GetTensorArgIdx<DIM, ARGS...>::value>())
+          : 0 ...);
+}
+
+/*
+ * Returns the number of elements in the vector argument
+ */
+template <camp::idx_t DIM, typename LAYOUT, typename... ARGS>
+RAJA_INLINE RAJA_HOST_DEVICE static constexpr camp::idx_t
+get_tensor_args_size(LAYOUT const& layout, ARGS... args)
+{
+  return RAJA::max<camp::idx_t>(
+      internal::expt::getTensorDim<ARGS>() == DIM
+          ? internal::expt::getTensorSize<ARGS>(
+                args, layout.template get_dim_size<
+                          GetTensorArgIdx<DIM, ARGS...>::value>())
+          : 0 ...);
+}
 #endif
 
 
+namespace detail
+{
+
+/*!
+ * Provides conversion of view data to a return type.
+ *
+ * For scalars, this just returns the scalar.
+ *
+ * In future development, this may return SIMD vectors or matrices using
+ * class specializations.
+ */
+template <typename VecSeq,
+          typename Args,
+          typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          typename LayoutType>
+struct ViewReturnHelper;
+
+
+/*
+ * Specialization for Scalar return types
+ */
+template <typename... Args,
+          typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          typename LayoutType>
+struct ViewReturnHelper<camp::idx_seq<>,
+                        camp::list<Args...>,
+                        ElementType,
+                        PointerType,
+                        LinIdx,
+                        LayoutType>
+{
+  using return_type = ElementType&;
+
+  RAJA_INLINE
+  RAJA_HOST_DEVICE
+  static constexpr return_type make_return(LayoutType const& layout,
+                                           PointerType const& data,
+                                           Args const&... args)
+  {
+    return data[stripIndexType(layout(args...))];
+  }
+};
+
 
-  /*
-   * Returns the number of arguments which are VectorIndexs
-   */
-  template<typename ... ARGS>
-  struct count_num_tensor_args{
-    static constexpr camp::idx_t value =
-#if defined(RAJA_ENABLE_VECTORIZATION)
-        RAJA::sum<camp::idx_t>(
-            (internal::expt::isTensorIndex<ARGS>() ? 1 : 0) ...);
-#else
-        0;  // There should be 0 Tensor indices if not vectorizing.
-#endif
-  };
-  
 #if defined(RAJA_ENABLE_VECTORIZATION)
-  /*
-   * Returns which argument has a vector index
-   */
-  template<camp::idx_t DIM, typename ... ARGS>
-  struct GetTensorArgIdx{
-      static constexpr camp::idx_t value =
-          detail::GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::make_idx_seq_t<sizeof...(ARGS)> >:: value;
-  };
-
-  template<camp::idx_t DIM, typename ... ARGS>
-  struct GetTensorArgIdx<DIM,camp::list<ARGS...>>{
-      static constexpr camp::idx_t value =
-          detail::GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::make_idx_seq_t<sizeof...(ARGS)> >:: value;
-  };
+/*
+ * Specialization for Tensor return types
+ */
+template <camp::idx_t VecHead,
+          camp::idx_t... VecSeq,
+          typename... Args,
+          typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          typename LayoutType>
+struct ViewReturnHelper<camp::idx_seq<VecHead, VecSeq...>,
+                        camp::list<Args...>,
+                        ElementType,
+                        PointerType,
+                        LinIdx,
+                        LayoutType>
+{
+
+  static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
+
+  // This is the stride-one dimension w.r.t. the tensor, not the View
+  // For example:
+  //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
+  //  For a matrix, s_stride_one_dim is either:
+  //                 -1 neither row nor column are packed
+  //                 0 rows are stride-one
+  //                 1 columns are stride-one
+  static constexpr camp::idx_t s_stride_one_dim = RAJA::max<camp::idx_t>(
+      (GetTensorArgIdx<VecHead, Args...>::value == LayoutType::stride_one_dim
+           ? VecHead
+           : -1),
+      (GetTensorArgIdx<VecSeq, Args...>::value == LayoutType::stride_one_dim
+           ? VecSeq
+           : -1)...);
+
+
+  using tensor_reg_type =
+      typename camp::at_v<camp::list<Args...>,
+                          GetTensorArgIdx<0, Args...>::value>::tensor_type;
+  using ref_type = internal::expt::TensorRef<ElementType*,
+                                             LinIdx,
+                                             internal::expt::TENSOR_MULTIPLE,
+                                             s_num_dims,
+                                             s_stride_one_dim>;
+  using return_type =
+      internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
 
-  /*
-   * Returns the beginning index in a vector argument
-   */
-  template<camp::idx_t DIM, typename LAYOUT, typename ... ARGS>
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  static constexpr camp::idx_t get_tensor_args_begin(LAYOUT const &layout, ARGS ... args){
-    return RAJA::max<camp::idx_t>(
-        internal::expt::getTensorDim<ARGS>()==DIM
-        ? internal::expt::getTensorBegin<ARGS>(args, layout.template get_dim_begin<GetTensorArgIdx<DIM, ARGS...>::value>())
-        : 0 ...);
+  static constexpr return_type make_return(LayoutType const& layout,
+                                           PointerType const& data,
+                                           Args const&... args)
+  {
+
+    return return_type(ref_type {
+        // data pointer
+        &data[0] +
+            layout(internal::expt::isTensorIndex<Args>()
+                       ? LinIdx {0}
+                       : (LinIdx)stripIndexType(
+                             internal::expt::stripTensorIndexByValue(args))...),
+        // strides
+        {(LinIdx)layout.template get_dim_stride<
+             GetTensorArgIdx<VecHead, Args...>::value>(),
+         (LinIdx)layout.template get_dim_stride<
+             GetTensorArgIdx<VecSeq, Args...>::value>()...},
+        // tile
+        {// begin
+         {(LinIdx)(get_tensor_args_begin<VecHead>(layout, args...)),
+          (LinIdx)(get_tensor_args_begin<VecSeq>(layout, args...))...},
+
+         // size
+         {(LinIdx)get_tensor_args_size<VecHead>(layout, args...),
+          (LinIdx)get_tensor_args_size<VecSeq>(layout, args...)...}}});
   }
+};
+
+
+/*
+ * Specialization for Tensor return types and static layout types
+ */
+template <camp::idx_t VecHead,
+          camp::idx_t... VecSeq,
+          typename... INDEX_TYPES,
+          typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          LinIdx... RangeInts,
+          LinIdx... SizeInts,
+          LinIdx... StrideInts,
+          typename DIM_LIST>
+struct ViewReturnHelper<
+    camp::idx_seq<VecHead, VecSeq...>,
+    camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>,
+    ElementType,
+    PointerType,
+    LinIdx,
+    RAJA::detail::StaticLayoutBase_impl<LinIdx,
+                                        camp::int_seq<LinIdx, RangeInts...>,
+                                        camp::int_seq<LinIdx, SizeInts...>,
+                                        camp::int_seq<LinIdx, StrideInts...>,
+                                        DIM_LIST>>
+{
+  static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
+
+  using index_list = camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>;
+
+  using range_seq  = camp::int_seq<LinIdx, RangeInts...>;
+  using size_seq   = camp::int_seq<LinIdx, SizeInts...>;
+  using stride_seq = camp::int_seq<LinIdx, StrideInts...>;
+  using LayoutType = RAJA::detail::
+      StaticLayoutBase_impl<LinIdx, range_seq, size_seq, stride_seq, DIM_LIST>;
+
+  // This is the stride-one dimension w.r.t. the tensor, not the View
+  // For example:
+  //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
+  //  For a matrix, s_stride_one_dim is either:
+  //                 -1 neither row nor column are packed
+  //                 0 rows are stride-one
+  //                 1 columns are stride-one
+  static constexpr camp::idx_t s_stride_one_dim = RAJA::max<camp::idx_t>(
+      (GetTensorArgIdx<VecHead, index_list>::value == LayoutType::stride_one_dim
+           ? VecHead
+           : -1),
+      (GetTensorArgIdx<VecSeq, index_list>::value == LayoutType::stride_one_dim
+           ? VecSeq
+           : -1)...);
+
+
+  using new_begin_seq =
+      camp::int_seq<LinIdx,
+                    (LinIdx)get_tensor_args_begin<VecHead>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
+                    (LinIdx)get_tensor_args_begin<VecSeq>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...>;
+  using new_size_seq =
+      camp::int_seq<LinIdx,
+                    (LinIdx)get_tensor_args_size<VecHead>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
+                    (LinIdx)get_tensor_args_size<VecSeq>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...>;
+
+  using new_begin_type = internal::expt::StaticIndexArray<new_begin_seq>;
+  using new_size_type  = internal::expt::StaticIndexArray<new_size_seq>;
+
+
+  using tensor_reg_type =
+      typename camp::at_v<index_list,
+                          GetTensorArgIdx<0, index_list>::value>::tensor_type;
+  using ref_type =
+      internal::expt::StaticTensorRef<ElementType*,
+                                      LinIdx,
+                                      internal::expt::TENSOR_MULTIPLE,
+                                      stride_seq,
+                                      new_begin_seq,
+                                      new_size_seq,
+                                      s_stride_one_dim>;
+  using return_type =
+      internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
+
 
-  /*
-   * Returns the number of elements in the vector argument
-   */
-  template<camp::idx_t DIM, typename LAYOUT, typename ... ARGS>
   RAJA_INLINE
   RAJA_HOST_DEVICE
-  static constexpr camp::idx_t get_tensor_args_size(LAYOUT const &layout, ARGS ... args){
-    return RAJA::max<camp::idx_t>(
-        internal::expt::getTensorDim<ARGS>()==DIM
-        ? internal::expt::getTensorSize<ARGS>(args, layout.template get_dim_size<GetTensorArgIdx<DIM, ARGS...>::value>())
-        : 0 ...);
+  static constexpr return_type
+  make_return(LayoutType const& layout,
+              PointerType const& data,
+              RAJA::expt::StaticTensorIndex<INDEX_TYPES> const&... args)
+  {
+
+    return return_type(ref_type {
+        // data pointer
+        &data[0] +
+            layout(internal::expt::isTensorIndex<
+                       typename RAJA::expt::StaticTensorIndex<
+                           INDEX_TYPES>::base_type>()
+                       ? LinIdx {0}
+                       : (LinIdx)stripIndexType(
+                             internal::expt::stripTensorIndexByValue(args))...),
+        // strides
+        typename ref_type::stride_type(),
+        // tile
+        {new_begin_type(), new_size_type()}});
   }
+};
 #endif
 
 
-  namespace detail {
+}  // namespace detail
 
-  /*!
-   * Provides conversion of view data to a return type.
-   *
-   * For scalars, this just returns the scalar.
-   *
-   * In the future development, this may return SIMD vectors or matrices using
-   * class specializations.
-   */
-  template<typename VecSeq, typename Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper;
 
+/*
+ * Computes the return type of a view.
+ *
+ * If any of the arguments are a VectorIndex, it creates a VectorRef
+ * return type.
+ *
+ * Otherwise it produces the usual scalar reference return type
+ */
+template <typename ElementType,
+          typename PointerType,
+          typename LinIdx,
+          typename LayoutType,
+          typename... Args>
+using view_return_type_t = typename detail::ViewReturnHelper<
+    camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
+    camp::list<Args...>,
+    ElementType,
+    PointerType,
+    LinIdx,
+    LayoutType>::return_type;
+
+/*
+ * Creates the return value for a View
+ *
+ * If any of the arguments are a VectorIndex, it creates a VectorRef
+ * return value.
+ *
+ * Otherwise it produces the usual scalar reference return value
+ */
+template <typename ElementType,
+          typename LinIdx,
+          typename LayoutType,
+          typename PointerType,
+          typename... Args>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr view_return_type_t<ElementType,
+                                                          PointerType,
+                                                          LinIdx,
+                                                          LayoutType,
+                                                          Args...>
+view_make_return_value(LayoutType const& layout,
+                       PointerType const& data,
+                       Args const&... args)
+{
+  return detail::ViewReturnHelper<
+      camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
+      camp::list<Args...>, ElementType, PointerType, LinIdx,
+      LayoutType>::make_return(layout, data, args...);
+}
 
-  /*
-   * Specialization for Scalar return types
-   */
-  template<typename ... Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper<camp::idx_seq<>, camp::list<Args...>, ElementType, PointerType, LinIdx, LayoutType>
-  {
-      using return_type = ElementType &;
+namespace detail
+{
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, Args const &... args){
-        return data[stripIndexType(layout(args...))];
-      }
-  };
+/**
+ * This class will help strip strongly typed indices
+ *
+ * This default implementation static_asserts that the stripped Arg type is
+ * convertible to the stripped Expected type; otherwise it's a compile error.
+ * This enforces types for the TypedView.
+ *
+ * This primary template handles plain (non-tensor) index arguments: there
+ * is nothing to unpack, so extract() just strips any strongly typed index.
+ */
+template <typename Expected, typename Arg>
+struct MatchTypedViewArgHelper
+{
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
 
+  using type = strip_index_type_t<Arg>;
 
-#if defined(RAJA_ENABLE_VECTORIZATION)
-  /*
-   * Specialization for Tensor return types
-   */
-  template<camp::idx_t VecHead, camp::idx_t ... VecSeq, typename ... Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper<camp::idx_seq<VecHead,VecSeq...>, camp::list<Args...>, ElementType, PointerType, LinIdx, LayoutType>
+  static RAJA_HOST_DEVICE RAJA_INLINE constexpr type extract(Arg arg)
   {
+    return stripIndexType(arg);
+  }
+};
 
-      static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
-
-      // This is the stride-one dimensions w.r.t. the tensor not the View
-      // For example:
-      //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
-      //  For a matrix, s_stride_one_dim is either:
-      //                 -1 neither row nor column are packed
-      //                 0 rows are stride-one
-      //                 1 columns are stride-one
-      static constexpr camp::idx_t s_stride_one_dim =
-          RAJA::max<camp::idx_t>(
-                  (GetTensorArgIdx<VecHead,Args...>::value == LayoutType::stride_one_dim ? VecHead : -1 ),
-                  (GetTensorArgIdx<VecSeq, Args...>::value == LayoutType::stride_one_dim ? VecSeq  : -1 )...
-          );
-
-
-      using tensor_reg_type = typename camp::at_v<camp::list<Args...>, GetTensorArgIdx<0, Args...>::value>::tensor_type;
-      using ref_type = internal::expt::TensorRef<ElementType*, LinIdx, internal::expt::TENSOR_MULTIPLE, s_num_dims, s_stride_one_dim>;
-      using return_type = internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, Args const &... args){
-
-        return return_type(ref_type{
-          // data pointer
-          &data[0] + layout(internal::expt::isTensorIndex<Args>() ? LinIdx{0} : (LinIdx)stripIndexType(internal::expt::stripTensorIndexByValue(args))...),
-          // strides
-          {
-              (LinIdx)layout.template get_dim_stride<GetTensorArgIdx<VecHead,Args...>::value>(),
-              (LinIdx)layout.template get_dim_stride<GetTensorArgIdx<VecSeq, Args...>::value>()...
-          },
-          // tile
-          {
-              // begin
-              {
-                  (LinIdx)(get_tensor_args_begin<VecHead>(layout, args...)),
-                  (LinIdx)(get_tensor_args_begin<VecSeq> (layout, args...))...
-              },
-
-              // size
-              {
-                  (LinIdx)get_tensor_args_size<VecHead>(layout, args...),
-                  (LinIdx)get_tensor_args_size<VecSeq> (layout, args...)...
-              }
-          }
-        });
-      }
-  };
 
+#if defined(RAJA_ENABLE_VECTORIZATION)
+/**
+ * Specialization where the argument is wrapped in a TensorIndex type
+ *
+ * In this case, the wrapped index is unpacked, stripped of any strong
+ * typing, and re-wrapped in a TensorIndex of the stripped type.
+ */
+template <typename Expected, typename Arg, typename VectorType, camp::idx_t DIM>
+struct MatchTypedViewArgHelper<Expected,
+                               RAJA::expt::TensorIndex<Arg, VectorType, DIM>>
+{
 
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
 
+  using arg_type = strip_index_type_t<Arg>;
 
+  using type = RAJA::expt::TensorIndex<arg_type, VectorType, DIM>;
 
-  /*
-   * Specialization for Tensor return types and static layout types
-   */
-  template<
-      camp::idx_t VecHead, camp::idx_t ... VecSeq,
-      typename ... INDEX_TYPES,
-      typename ElementType, typename PointerType, typename LinIdx,
-      LinIdx... RangeInts, LinIdx... SizeInts, LinIdx... StrideInts,
-      typename DIM_LIST
-  >
-  struct ViewReturnHelper<
-      camp::idx_seq<VecHead,VecSeq...>,
-      camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>,
-      ElementType, PointerType,
-      LinIdx,
-      RAJA::detail::StaticLayoutBase_impl<
-          LinIdx,
-          camp::int_seq<LinIdx,RangeInts...>,
-          camp::int_seq<LinIdx,SizeInts...>,
-          camp::int_seq<LinIdx,StrideInts...>,
-          DIM_LIST
-      >
-  > {
-      static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
-
-      using index_list = camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>;
-
-      using range_seq  = camp::int_seq<LinIdx,RangeInts... >;
-      using size_seq   = camp::int_seq<LinIdx,SizeInts...  >;
-      using stride_seq = camp::int_seq<LinIdx,StrideInts...>;
-      using LayoutType = RAJA::detail::StaticLayoutBase_impl<LinIdx,range_seq,size_seq,stride_seq,DIM_LIST>;
-
-      // This is the stride-one dimensions w.r.t. the tensor not the View
-      // For example:
-      //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
-      //  For a matrix, s_stride_one_dim is either:
-      //                 -1 neither row nor column are packed
-      //                 0 rows are stride-one
-      //                 1 columns are stride-one
-      static constexpr camp::idx_t s_stride_one_dim =
-          RAJA::max<camp::idx_t>(
-                  (GetTensorArgIdx<VecHead,index_list>::value == LayoutType::stride_one_dim ? VecHead : -1 ),
-                  (GetTensorArgIdx<VecSeq, index_list>::value == LayoutType::stride_one_dim ? VecSeq  : -1 )...
-          );
-
-
-
-
-      using new_begin_seq = camp::int_seq<
-                LinIdx,
-                (LinIdx)get_tensor_args_begin<VecHead>(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
-                (LinIdx)get_tensor_args_begin<VecSeq >(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...
-            >;
-      using new_size_seq  = camp::int_seq<
-                LinIdx,
-                (LinIdx)get_tensor_args_size <VecHead>(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
-                (LinIdx)get_tensor_args_size <VecSeq >(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...
-            >;
-
-      using new_begin_type = internal::expt::StaticIndexArray<new_begin_seq>;
-      using new_size_type  = internal::expt::StaticIndexArray<new_size_seq >;
-
-
-      using tensor_reg_type = typename camp::at_v<index_list, GetTensorArgIdx<0, index_list>::value>::tensor_type;
-      using ref_type = internal::expt::StaticTensorRef<ElementType*, LinIdx, internal::expt::TENSOR_MULTIPLE,stride_seq,new_begin_seq,new_size_seq, s_stride_one_dim>;
-      using return_type = internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, RAJA::expt::StaticTensorIndex<INDEX_TYPES> const &... args){
-
-        return return_type(ref_type{
-          // data pointer
-          &data[0] + layout(internal::expt::isTensorIndex<typename RAJA::expt::StaticTensorIndex<INDEX_TYPES>::base_type>() ? LinIdx{0} : (LinIdx)stripIndexType(internal::expt::stripTensorIndexByValue(args))...),
-          // strides
-          typename ref_type::stride_type(),
-          // tile
-          {
-              new_begin_type(),
-              new_size_type()
-          }
-        });
-      }
-  };
-#endif
+  static constexpr RAJA_HOST_DEVICE RAJA_INLINE type
+  extract(RAJA::expt::TensorIndex<Arg, VectorType, DIM> vec_arg)
+  {
+    return type(stripIndexType(*vec_arg), vec_arg.size());
+  }
+};
 
+/**
+ * Specialization where the argument is wrapped in a StaticTensorIndex type
+ *
+ * In this case, the index is encoded in the type, so the argument value is
+ * unused and a default-constructed, strong-type-stripped index is returned.
+ */
+template <typename Expected,
+          typename Arg,
+          typename VectorType,
+          camp::idx_t DIM,
+          Arg BEGIN,
+          strip_index_type_t<Arg> LENGTH>
+struct MatchTypedViewArgHelper<
+    Expected,
+    RAJA::expt::StaticTensorIndex<
+        RAJA::expt::
+            StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>>>
+{
 
-  } // namespace detail
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
 
+  using arg_type = strip_index_type_t<Arg>;
 
-  /*
-   * Computes the return type of a view.
-   *
-   * If any of the arguments are a VectorIndex, it creates a VectorRef
-   * return type.
-   *
-   * Otherwise it produces the usual scalar reference return type
-   */
-  template<typename ElementType, typename PointerType, typename LinIdx, typename LayoutType, typename ... Args>
-  using view_return_type_t =
-      typename detail::ViewReturnHelper<
-        camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
-        camp::list<Args...>,
-        ElementType,
-        PointerType,
-        LinIdx,
-        LayoutType>::return_type;
+  using type = RAJA::expt::StaticTensorIndex<
+      RAJA::expt::
+          StaticTensorIndexInner<arg_type, VectorType, DIM, BEGIN, LENGTH>>;
 
-  /*
-   * Creates the return value for a View
-   *
-   * If any of the arguments are a VectorIndex, it creates a VectorRef
-   * return value.
-   *
-   * Otherwise it produces the usual scalar reference return value
-   */
-  template<typename ElementType, typename LinIdx, typename LayoutType, typename PointerType, typename ... Args>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  view_return_type_t<ElementType, PointerType, LinIdx, LayoutType, Args...>
-  view_make_return_value(LayoutType const &layout, PointerType const &data, Args const &... args){
-    return detail::ViewReturnHelper<
-        camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
-        camp::list<Args...>,
-        ElementType,
-        PointerType,
-        LinIdx,
-        LayoutType>::make_return(layout, data, args...);
+  static constexpr RAJA_HOST_DEVICE RAJA_INLINE type extract(
+      RAJA::expt::StaticTensorIndex<
+          RAJA::expt::
+              StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>>
+          RAJA_UNUSED_ARG(vec_arg))
+  {
+    return type();
   }
+};
+#endif
 
-  namespace detail
-  {
+}  // namespace detail
 
-  /**
-   * This class will help strip strongly typed indices
-   *
-   * This default implementation static_asserts that Expected==Arg, otherwise
-   * it's an error.  This enforces types for the TypedView.
-   *
-   * Specialization where expected type is same as argument type.
-   * In this case, there is no VectorIndex to unpack, just strip any strongly
-   * typed indices.
-   */
-  template<typename Expected, typename Arg>
-  struct MatchTypedViewArgHelper{
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
 
-    using type = strip_index_type_t<Arg>;
+template <typename Expected, typename Arg>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr
+    typename detail::MatchTypedViewArgHelper<Expected, Arg>::type
+    match_typed_view_arg(Arg const& arg)
+{
+  return detail::MatchTypedViewArgHelper<Expected, Arg>::extract(arg);
+}
+
 
-    static RAJA_HOST_DEVICE RAJA_INLINE
-    constexpr
-    type extract(Arg arg){
-      return stripIndexType(arg);
-    }
-  };
+template <typename ValueType, typename PointerType, typename LayoutType>
+class ViewBase
+{
 
+public:
+  using value_type        = ValueType;
+  using pointer_type      = PointerType;
+  using layout_type       = LayoutType;
+  using linear_index_type = typename layout_type::IndexLinear;
+  using nc_value_type     = typename std::remove_const<value_type>::type;
+  using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
+      typename std::remove_pointer<pointer_type>::type>::type>::type;
 
-#if defined(RAJA_ENABLE_VECTORIZATION)
-  /**
-   * Specialization where expected type is wrapped in a VectorIndex type
+  using Self         = ViewBase<value_type, pointer_type, layout_type>;
+  using NonConstView = ViewBase<nc_value_type, nc_pointer_type, layout_type>;
+
+  using shifted_layout_type = typename add_offset<layout_type>::type;
+  using ShiftedView = ViewBase<value_type, pointer_type, shifted_layout_type>;
+
+protected:
+  pointer_type m_data;
+  layout_type const m_layout;
+
+public:
+  /*
+   * Defaulted operators (AJK):
+   *
+   * OpenMP Target currently needs the View classes to be trivially copyable,
+   * which means that we need to use the default ctor's and assignment
+   * operators.
    *
-   * In this case, there is no VectorIndex to unpack, just strip any strongly
-   * typed indices.
+   * These defaulted operators cause issues with some versions of CUDA, so
+   * in the case that CUDA is enabled, we switch to explicitly defined
+   * operators.
    */
-  template<typename Expected, typename Arg, typename VectorType, camp::idx_t DIM>
-  struct MatchTypedViewArgHelper<Expected, RAJA::expt::TensorIndex<Arg, VectorType, DIM> >{
+#if (defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_CLANG_CUDA))
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr ViewBase() {};
 
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
+  RAJA_HOST_DEVICE
+  RAJA_INLINE ViewBase(ViewBase const& c)
+      : m_layout(c.m_layout), m_data(c.m_data)
+  {}
 
-    using arg_type = strip_index_type_t<Arg>;
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  ViewBase& operator=(ViewBase const& c)
+  {
+    m_layout = c.m_layout;
+    m_data   = c.m_data;
+  }
+#else
+  constexpr ViewBase()                             = default;
+  RAJA_INLINE constexpr ViewBase(ViewBase const&)  = default;
+  RAJA_INLINE constexpr ViewBase(ViewBase&&)       = default;
+  RAJA_INLINE ViewBase& operator=(ViewBase const&) = default;
+  RAJA_INLINE ViewBase& operator=(ViewBase&&)      = default;
 
-    using type = RAJA::expt::TensorIndex<arg_type, VectorType, DIM>;
+#endif
 
-    static constexpr RAJA_HOST_DEVICE RAJA_INLINE
-    type extract(RAJA::expt::TensorIndex<Arg, VectorType, DIM> vec_arg){
-      return type(stripIndexType(*vec_arg), vec_arg.size());
-    }
-  };
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr ViewBase(pointer_type data, layout_type&& layout)
+      : m_data(data), m_layout(layout)
+  {}
 
-  /**
-   * Specialization where expected type is wrapped in a StaticTensorIndex type
-   *
-   * In this case, there is no StaticTensorIndex to unpack, just strip any strongly
-   * typed indices.
-   */
-  template<typename Expected, typename Arg, typename VectorType, camp::idx_t DIM, Arg BEGIN, strip_index_type_t<Arg> LENGTH>
-  struct MatchTypedViewArgHelper<Expected, RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>> >{
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr ViewBase(pointer_type data,
+                                                  Args... dim_sizes)
+      : m_data(data), m_layout(dim_sizes...)
+  {}
 
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
 
-    using arg_type = strip_index_type_t<Arg>;
+  template <bool IsConstView = std::is_const<value_type>::value>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr ViewBase(
+      typename std::enable_if<IsConstView, NonConstView>::type const& rhs)
+      : m_data(rhs.get_data()), m_layout(rhs.get_layout())
+  {}
 
-    using type = RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<arg_type, VectorType, DIM, BEGIN, LENGTH>>;
 
-    static constexpr RAJA_HOST_DEVICE RAJA_INLINE
-    type extract(RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>> RAJA_UNUSED_ARG(vec_arg)){
-      return type();
-    }
-  };
-#endif
+  RAJA_HOST_DEVICE
+  RAJA_INLINE void set_data(PointerType data_ptr) { m_data = data_ptr; }
 
-  } //namespace detail
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr pointer_type const& get_data() const { return m_data; }
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr layout_type const& get_layout() const { return m_layout; }
 
-  template<typename Expected, typename Arg>
   RAJA_HOST_DEVICE
   RAJA_INLINE
-  constexpr
-  typename detail::MatchTypedViewArgHelper<Expected, Arg>::type
-  match_typed_view_arg(Arg const &arg)
+  constexpr linear_index_type size() const { return m_layout.size(); }
+
+
+  template <camp::idx_t DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr linear_index_type get_dim_size() const
   {
-    return detail::MatchTypedViewArgHelper<Expected, Arg>::extract(arg);
+    return m_layout.template get_dim_size<DIM>();
   }
 
 
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator()(Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        m_layout, m_data, args...);
+  }
 
-template <typename ValueType,
-          typename PointerType,
-          typename LayoutType>
-class ViewBase {
-
-  public:
-    using value_type = ValueType;
-    using pointer_type = PointerType;
-    using layout_type = LayoutType;
-    using linear_index_type = typename layout_type::IndexLinear;
-    using nc_value_type = typename std::remove_const<value_type>::type;
-    using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
-        typename std::remove_pointer<pointer_type>::type>::type>::type;
-
-    using Self = ViewBase<value_type, pointer_type, layout_type>;
-    using NonConstView = ViewBase<nc_value_type, nc_pointer_type, layout_type>;
-
-    using shifted_layout_type = typename add_offset<layout_type>::type;
-    using ShiftedView = ViewBase<value_type, pointer_type, shifted_layout_type>;
-
-  protected:
-    pointer_type m_data;
-    layout_type const m_layout;
-
-  public:
-
-
-    /*
-     * Defaulted operators (AJK):
-     *
-     * OpenMP Target currently needs the View classes to be trivially copyable,
-     * which means that we need to use the default ctor's and assignment
-     * operators.
-     *
-     * These defaulted operators cause issues with some versions of CUDA, so
-     * in the case that CUDA is enabled, we switch to explicitly defined
-     * operators.
-     */
-#if (defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_CLANG_CUDA))
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr ViewBase(){};
 
-    RAJA_HOST_DEVICE
-    RAJA_INLINE ViewBase(ViewBase const &c)
-      : m_layout(c.m_layout), m_data(c.m_data)
-    {
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    ViewBase &operator=(ViewBase const &c)
-    {
-      m_layout = c.m_layout;
-      m_data = c.m_data;
-    }
-#else
-    constexpr ViewBase() = default;
-    RAJA_INLINE constexpr ViewBase(ViewBase const &) = default;
-    RAJA_INLINE constexpr ViewBase(ViewBase &&) = default;
-    RAJA_INLINE ViewBase& operator=(ViewBase const &) = default;
-    RAJA_INLINE ViewBase& operator=(ViewBase &&) = default;
+  /*
+   * Compatibility note (AJK):
+   * We are using variadic arguments even though operator[] takes exactly 1
+   * argument. This gets around a template instantiation bug in CUDA/nvcc 9.1,
+   * which seems to have been fixed in CUDA 9.2+.
+   */
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator[](Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        m_layout, m_data, args...);
+  }
 
-#endif
 
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(pointer_type data, layout_type &&layout) :
-    m_data(data), m_layout(layout)
-    {
-    }
-
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(pointer_type data, Args... dim_sizes) :
-    m_data(data), m_layout(dim_sizes...)
-    {
-    }
-
-
-    template <bool IsConstView = std::is_const<value_type>::value>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(typename std::enable_if<IsConstView, NonConstView>::type const &rhs) :
-    m_data(rhs.get_data()), m_layout(rhs.get_layout())
-    {
-    }
-
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE void set_data(PointerType data_ptr){
-      m_data = data_ptr;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    pointer_type const &get_data() const
-    {
-      return m_data;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    layout_type const &get_layout() const
-    {
-      return m_layout;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    linear_index_type size() const
-    {
-      return m_layout.size();
-    }
-
-
-    template<camp::idx_t DIM>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    linear_index_type get_dim_size() const
-    {
-      return m_layout.template get_dim_size<DIM>();
-    }
-
-
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator()(Args... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(m_layout, m_data, args...);
-    }
-
-
-
-    /*
-     * Compatibility note (AJK):
-     * We are using variadic arguments even though operator[] takes exactly 1 argument
-     * This gets around a template instantiation bug in CUDA/nvcc 9.1, which seems to have
-     * been fixed in CUDA 9.2+
-     */
-    template <typename ... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator[](Args ... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(m_layout, m_data, args...);
-    }
-
-
-
-    template <size_t n_dims = layout_type::n_dims, typename IdxLin = linear_index_type>
-    RAJA_INLINE
-    ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
-    {
-      static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
-
-      shifted_layout_type shift_layout(m_layout);
-      shift_layout.shift(shift);
-
-      return ShiftedView(m_data, shift_layout);
-    }
+  template <size_t n_dims   = layout_type::n_dims,
+            typename IdxLin = linear_index_type>
+  RAJA_INLINE ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
+  {
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
+
+    shifted_layout_type shift_layout(m_layout);
+    shift_layout.shift(shift);
 
+    return ShiftedView(m_data, shift_layout);
+  }
 };
 
 
 template <typename ValueType,
-        typename PointerType,
-        typename LayoutType,
-        typename IndexTypes>
+          typename PointerType,
+          typename LayoutType,
+          typename IndexTypes>
 class TypedViewBase;
 
 template <typename ValueType,
           typename PointerType,
           typename LayoutType,
           typename... IndexTypes>
-class TypedViewBase<ValueType, PointerType, LayoutType, camp::list<IndexTypes...>> :
-  public ViewBase<ValueType, PointerType, LayoutType>
+class TypedViewBase<ValueType,
+                    PointerType,
+                    LayoutType,
+                    camp::list<IndexTypes...>>
+    : public ViewBase<ValueType, PointerType, LayoutType>
 {
 
-  public:
-    using value_type = ValueType;
-    using pointer_type = PointerType;
-    using layout_type = LayoutType;
-    using linear_index_type = typename layout_type::IndexLinear;
-    using nc_value_type = typename std::remove_const<value_type>::type;
-    using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
-        typename std::remove_pointer<pointer_type>::type>::type>::type;
-
-    using Base = ViewBase<ValueType, PointerType, LayoutType>;
-    using Self = TypedViewBase<value_type, pointer_type, layout_type, camp::list<IndexTypes...> >;
-    using NonConstView = TypedViewBase<nc_value_type, nc_pointer_type, layout_type, camp::list<IndexTypes...> >;
-
-    using shifted_layout_type = typename add_offset<layout_type>::type;
-    using ShiftedView = TypedViewBase<value_type, pointer_type, shifted_layout_type, camp::list<IndexTypes...> >;
-
-    static constexpr size_t n_dims = sizeof...(IndexTypes);
-
-    using Base::Base;
-
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator()(Args... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(Base::m_layout, Base::m_data, match_typed_view_arg<IndexTypes>(args)...);
-    }
-
-
-
-    /*
-     * Compatibility note (AJK):
-     * We are using variadic arguments even though operator[] takes exactly 1 argument
-     * This gets around a template instantiation bug in CUDA/nvcc 9.1, which seems to have
-     * been fixed in CUDA 9.2+
-     */
-    template <typename ... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator[](Args ... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(Base::m_layout, Base::m_data, match_typed_view_arg<IndexTypes>(args)...);
-    }
+public:
+  using value_type        = ValueType;
+  using pointer_type      = PointerType;
+  using layout_type       = LayoutType;
+  using linear_index_type = typename layout_type::IndexLinear;
+  using nc_value_type     = typename std::remove_const<value_type>::type;
+  using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
+      typename std::remove_pointer<pointer_type>::type>::type>::type;
+
+  using Base         = ViewBase<ValueType, PointerType, LayoutType>;
+  using Self         = TypedViewBase<value_type,
+                             pointer_type,
+                             layout_type,
+                             camp::list<IndexTypes...>>;
+  using NonConstView = TypedViewBase<nc_value_type,
+                                     nc_pointer_type,
+                                     layout_type,
+                                     camp::list<IndexTypes...>>;
+
+  using shifted_layout_type = typename add_offset<layout_type>::type;
+  using ShiftedView         = TypedViewBase<value_type,
+                                    pointer_type,
+                                    shifted_layout_type,
+                                    camp::list<IndexTypes...>>;
+
+  static constexpr size_t n_dims = sizeof...(IndexTypes);
+
+  using Base::Base;
+
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator()(Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        Base::m_layout, Base::m_data,
+        match_typed_view_arg<IndexTypes>(args)...);
+  }
 
 
+  /*
+   * Compatibility note (AJK):
+   * We are using variadic arguments even though operator[] takes exactly 1
+   * argument. This gets around a template instantiation bug in CUDA/nvcc 9.1,
+   * which seems to have been fixed in CUDA 9.2+.
+   */
+  template <typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator[](Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        Base::m_layout, Base::m_data,
+        match_typed_view_arg<IndexTypes>(args)...);
+  }
 
-    template <size_t n_dims = sizeof...(IndexTypes), typename IdxLin = linear_index_type>
-    RAJA_INLINE
-    ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
-    {
-      static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
 
-      shifted_layout_type shift_layout(Base::get_layout());
-      shift_layout.shift(shift);
+  template <size_t n_dims   = sizeof...(IndexTypes),
+            typename IdxLin = linear_index_type>
+  RAJA_INLINE ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
+  {
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
 
-      return ShiftedView(Base::get_data(), shift_layout);
-    }
+    shifted_layout_type shift_layout(Base::get_layout());
+    shift_layout.shift(shift);
 
+    return ShiftedView(Base::get_data(), shift_layout);
+  }
 };
 
 
-
-} // namespace internal
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp
index fcaee67f98..be3db700a6 100644
--- a/include/RAJA/util/View.hpp
+++ b/include/RAJA/util/View.hpp
@@ -32,66 +32,59 @@
 namespace RAJA
 {
 
-//Helpers to convert
-//layouts -> OffsetLayouts
-//Typedlayouts -> TypedOffsetLayouts
-template<typename layout>
+// Helpers to convert
+// layouts -> OffsetLayouts
+// TypedLayouts -> TypedOffsetLayouts
+template <typename layout>
 struct add_offset
 {
   using type = RAJA::OffsetLayout<layout::n_dims>;
 };
 
-template<typename IdxLin, typename...DimTypes>
-struct add_offset<RAJA::TypedLayout<IdxLin,camp::tuple<DimTypes...>>>
+template <typename IdxLin, typename... DimTypes>
+struct add_offset<RAJA::TypedLayout<IdxLin, camp::tuple<DimTypes...>>>
 {
-  using type = RAJA::TypedOffsetLayout<IdxLin,camp::tuple<DimTypes...>>;
+  using type = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
 };
 
 template <typename ValueType,
           typename LayoutType,
-          typename PointerType = ValueType *>
-using View =
-    internal::ViewBase<ValueType, PointerType, LayoutType>;
-
+          typename PointerType = ValueType*>
+using View = internal::ViewBase<ValueType, PointerType, LayoutType>;
 
 
 template <typename ValueType, typename LayoutType, typename... IndexTypes>
-using TypedView =
-    internal::TypedViewBase<ValueType, ValueType *, LayoutType, camp::list<IndexTypes...> >;
-
-
-
+using TypedView = internal::
+    TypedViewBase<ValueType, ValueType*, LayoutType, camp::list<IndexTypes...>>;
 
 
 template <typename IndexType, typename ValueType>
-RAJA_INLINE View<ValueType, Layout<1, IndexType, 0> > make_view(
-    ValueType *ptr)
+RAJA_INLINE View<ValueType, Layout<1, IndexType, 0>> make_view(ValueType* ptr)
 {
-  return View<ValueType, Layout<1, IndexType, 0> >(ptr, 1);
+  return View<ValueType, Layout<1, IndexType, 0>>(ptr, 1);
 }
 
-template <size_t n_dims, typename IndexType, typename ValueType, typename... IndexTypes>
-RAJA_INLINE View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...> > make_index_view(
-    ValueType *ptr, IndexLayout<n_dims, IndexType, IndexTypes...> index_layout)
+template <size_t n_dims,
+          typename IndexType,
+          typename ValueType,
+          typename... IndexTypes>
+RAJA_INLINE View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...>>
+make_index_view(ValueType* ptr,
+                IndexLayout<n_dims, IndexType, IndexTypes...> index_layout)
 {
-  return View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...> >(ptr, index_layout);
+  return View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...>>(
+      ptr, index_layout);
 }
 
 
 // select certain indices from a tuple, given a curated index sequence
 // returns linear index of layout(ar...)
 template <typename Lay, typename Tup, camp::idx_t... Idxs>
-RAJA_HOST_DEVICE RAJA_INLINE 
-auto selecttuple( Lay lyout, Tup&& tup, camp::idx_seq<Idxs...> ) ->
-  decltype(
-            lyout(
-              camp::get<Idxs>(std::forward<Tup>(tup))...
-            )
-          )
-{ 
-  return lyout(
-                camp::get<Idxs>(std::forward<Tup>(tup))...
-              );
+RAJA_HOST_DEVICE RAJA_INLINE auto
+selecttuple(Lay lyout, Tup&& tup, camp::idx_seq<Idxs...>)
+    -> decltype(lyout(camp::get<Idxs>(std::forward<Tup>(tup))...))
+{
+  return lyout(camp::get<Idxs>(std::forward<Tup>(tup))...);
 }
 
 // sequence combiner
@@ -99,9 +92,7 @@ template <typename Seq1, typename Seq2>
 struct cat_seq;
 
 template <camp::idx_t... Idxs1, camp::idx_t... Idxs2>
-struct cat_seq  < camp::idx_seq<Idxs1...>,
-                  camp::idx_seq<Idxs2...>
-                >
+struct cat_seq<camp::idx_seq<Idxs1...>, camp::idx_seq<Idxs2...>>
 {
   using type = camp::idx_seq<Idxs1..., Idxs2...>;
 };
@@ -116,7 +107,7 @@ struct offset_seq;
 template <camp::idx_t Offset, camp::idx_t... Idxs>
 struct offset_seq<Offset, camp::idx_seq<Idxs...>>
 {
-  using type = camp::idx_seq<(Idxs+Offset)...>;
+  using type = camp::idx_seq<(Idxs + Offset)...>;
 };
 
 template <camp::idx_t Offset, typename Seq>
@@ -125,60 +116,50 @@ using offset_seq_t = typename offset_seq<Offset, Seq>::type;
 // remove the Nth index in a parameter pack
 // returns linear index of layout(ar...)
 template <typename Lay, RAJA::Index_type Nth = 0, typename Tup>
-RAJA_HOST_DEVICE RAJA_INLINE auto removenth( Lay lyout, Tup&& tup ) ->
-  decltype( selecttuple<Lay>(
-              lyout,
-              std::forward<Tup>(tup),
-              cat_seq_t<  camp::make_idx_seq_t<Nth>,  // sequence up to Nth
-                          offset_seq_t<
-                            Nth+1,  // after Nth
-                            camp::make_idx_seq_t<camp::tuple_size<Tup>::value - Nth-1>
-                          > // sequence after Nth
-                       >{}
-            )
-          )
+RAJA_HOST_DEVICE RAJA_INLINE auto
+removenth(Lay lyout, Tup&& tup) -> decltype(selecttuple<Lay>(
+    lyout,
+    std::forward<Tup>(tup),
+    cat_seq_t<camp::make_idx_seq_t<Nth>,  // sequence up to Nth
+              offset_seq_t<Nth + 1,       // after Nth
+                           camp::make_idx_seq_t<camp::tuple_size<Tup>::value -
+                                                Nth - 1>>  // sequence after Nth
+              > {}))
 {
   return selecttuple<Lay>(
-              lyout,
-              std::forward<Tup>(tup),
-              cat_seq_t<  camp::make_idx_seq_t<Nth>,  // sequence up to Nth
-                          offset_seq_t<
-                            Nth+1,  // after Nth
-                            camp::make_idx_seq_t<camp::tuple_size<Tup>::value - Nth-1>
-                          > // sequence after Nth
-                       >{}
-          );
+      lyout, std::forward<Tup>(tup),
+      cat_seq_t<camp::make_idx_seq_t<Nth>,  // sequence up to Nth
+                offset_seq_t<Nth + 1,       // after Nth
+                             camp::make_idx_seq_t<camp::tuple_size<Tup>::value -
+                                                  Nth - 1>>  // sequence after
+                                                             // Nth
+                > {});
 }
 
 
-
-
-// P2Pidx represents the array-of-pointers index. This allows the position of the
-// index into the array-of-pointers to be moved around in the MultiView operator();
-// see the operator overload.
-// Default of 0 means that the p2p index is in the 0th position.
-template <typename ValueType,
-          typename LayoutType,
-          RAJA::Index_type P2Pidx = 0,
-          typename PointerType = ValueType **,
-          typename NonConstPointerType =
-              camp::type::ptr::add< // adds *
-                camp::type::ptr::add<
-                  camp::type::cv::rem<  // removes cv
-                    camp::type::ptr::rem<
-                      camp::type::ptr::rem<PointerType>  // removes *
-                    >
-                  >
-                >
-              >
-          >
-struct MultiView {
-  using value_type = ValueType;
-  using pointer_type = PointerType;
-  using layout_type = LayoutType;
-  using nc_value_type = camp::decay<value_type>;
+// P2Pidx represents the array-of-pointers index. This allows the position of
+// the index into the array-of-pointers to be moved around in the MultiView
+// operator(); see the operator overload. Default of 0 means that the p2p index
+// is in the 0th position.
+template <
+    typename ValueType,
+    typename LayoutType,
+    RAJA::Index_type P2Pidx      = 0,
+    typename PointerType         = ValueType**,
+    typename NonConstPointerType = camp::type::ptr::add<  // adds *
+        camp::type::ptr::add<camp::type::cv::rem<         // removes cv
+            camp::type::ptr::rem<camp::type::ptr::rem<PointerType>  // removes
+                                                                    // *
+                                 >>>>>
+struct MultiView
+{
+  using value_type      = ValueType;
+  using pointer_type    = PointerType;
+  using layout_type     = LayoutType;
+  using nc_value_type   = camp::decay<value_type>;
   using nc_pointer_type = NonConstPointerType;
-  using NonConstView = MultiView<nc_value_type, layout_type, P2Pidx, nc_pointer_type>;
+  using NonConstView =
+      MultiView<nc_value_type, layout_type, P2Pidx, nc_pointer_type>;
 
   layout_type const layout;
   nc_pointer_type data;
@@ -186,39 +167,38 @@ struct MultiView {
   template <typename... Args>
   RAJA_INLINE constexpr MultiView(pointer_type data_ptr, Args... dim_sizes)
       : layout(dim_sizes...), data(data_ptr)
-  {
-  }
+  {}
 
-  RAJA_INLINE constexpr MultiView(pointer_type data_ptr, layout_type &&layout)
+  RAJA_INLINE constexpr MultiView(pointer_type data_ptr, layout_type&& layout)
       : layout(layout), data(data_ptr)
-  {
-  }
+  {}
 
-  RAJA_INLINE constexpr MultiView(MultiView const &) = default;
-  RAJA_INLINE constexpr MultiView(MultiView &&) = default;
-  RAJA_INLINE MultiView& operator=(MultiView const &) = default;
-  RAJA_INLINE MultiView& operator=(MultiView &&) = default;
+  RAJA_INLINE constexpr MultiView(MultiView const&)  = default;
+  RAJA_INLINE constexpr MultiView(MultiView&&)       = default;
+  RAJA_INLINE MultiView& operator=(MultiView const&) = default;
+  RAJA_INLINE MultiView& operator=(MultiView&&)      = default;
 
   template <bool IsConstView = std::is_const<value_type>::value>
   RAJA_INLINE constexpr MultiView(
-      typename std::enable_if<IsConstView, NonConstView>::type const &rhs)
-      : layout(rhs.layout),
-        data(rhs.data)
-  {
-  }
+      typename std::enable_if<IsConstView, NonConstView>::type const& rhs)
+      : layout(rhs.layout), data(rhs.data)
+  {}
 
   RAJA_INLINE void set_data(pointer_type data_ptr) { data = data_ptr; }
 
-  template <size_t n_dims=layout_type::n_dims, typename IdxLin = Index_type>
-  RAJA_INLINE RAJA::MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>
-  shift(const std::array<IdxLin, n_dims>& shift)
+  template <size_t n_dims = layout_type::n_dims, typename IdxLin = Index_type>
+  RAJA_INLINE
+      RAJA::MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>
+      shift(const std::array<IdxLin, n_dims>& shift)
   {
-    static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
 
     typename add_offset<layout_type>::type shift_layout(layout);
     shift_layout.shift(shift);
 
-    return RAJA::MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>(data, shift_layout);
+    return RAJA::MultiView<ValueType, typename add_offset<layout_type>::type,
+                           P2Pidx>(data, shift_layout);
   }
 
   // Moving the position of the index into the array-of-pointers
@@ -226,26 +206,30 @@ struct MultiView {
   // making this specifically typed would require unpacking the layout,
   // this is easier to maintain
   template <typename... Args>
-  RAJA_HOST_DEVICE RAJA_INLINE value_type &operator()(Args... ar) const
+  RAJA_HOST_DEVICE RAJA_INLINE value_type& operator()(Args... ar) const
   {
-    auto pidx = stripIndexType( camp::get<P2Pidx>( camp::forward_as_tuple( ar... ) ) );
+    auto pidx =
+        stripIndexType(camp::get<P2Pidx>(camp::forward_as_tuple(ar...)));
 
-    if ( pidx < 0 )
+    if (pidx < 0)
     {
-      RAJA_ABORT_OR_THROW( "Negative index while accessing array of pointers.\n" );
+      RAJA_ABORT_OR_THROW(
+          "Negative index while accessing array of pointers.\n");
     }
-    
-    auto idx = stripIndexType( removenth<LayoutType, P2Pidx>( layout, camp::forward_as_tuple( ar... ) ) );
+
+    auto idx = stripIndexType(
+        removenth<LayoutType, P2Pidx>(layout, camp::forward_as_tuple(ar...)));
     return data[pidx][idx];
   }
 };
 
 template <typename ViewType, typename AtomicPolicy = RAJA::auto_atomic>
-struct AtomicViewWrapper {
-  using base_type = ViewType;
+struct AtomicViewWrapper
+{
+  using base_type    = ViewType;
   using pointer_type = typename base_type::pointer_type;
-  using value_type = typename base_type::value_type;
-  using atomic_type = RAJA::AtomicRef<value_type, AtomicPolicy>;
+  using value_type   = typename base_type::value_type;
+  using atomic_type  = RAJA::AtomicRef<value_type, AtomicPolicy>;
 
   base_type base_;
 
@@ -255,7 +239,7 @@ struct AtomicViewWrapper {
   RAJA_INLINE void set_data(pointer_type data_ptr) { base_.set_data(data_ptr); }
 
   template <typename... ARGS>
-  RAJA_HOST_DEVICE RAJA_INLINE atomic_type operator()(ARGS &&... args) const
+  RAJA_HOST_DEVICE RAJA_INLINE atomic_type operator()(ARGS&&... args) const
   {
     return atomic_type(&base_.operator()(std::forward<ARGS>(args)...));
   }
@@ -267,21 +251,22 @@ struct AtomicViewWrapper {
  * for performance
  */
 template <typename ViewType>
-struct AtomicViewWrapper<ViewType, RAJA::seq_atomic> {
-  using base_type = ViewType;
+struct AtomicViewWrapper<ViewType, RAJA::seq_atomic>
+{
+  using base_type    = ViewType;
   using pointer_type = typename base_type::pointer_type;
-  using value_type = typename base_type::value_type;
-  using atomic_type = RAJA::AtomicRef<value_type, RAJA::seq_atomic>;
+  using value_type   = typename base_type::value_type;
+  using atomic_type  = RAJA::AtomicRef<value_type, RAJA::seq_atomic>;
 
   base_type base_;
 
   RAJA_INLINE
-  constexpr explicit AtomicViewWrapper(ViewType const &view) : base_{view} {}
+  constexpr explicit AtomicViewWrapper(ViewType const& view) : base_ {view} {}
 
   RAJA_INLINE void set_data(pointer_type data_ptr) { base_.set_data(data_ptr); }
 
   template <typename... ARGS>
-  RAJA_HOST_DEVICE RAJA_INLINE value_type &operator()(ARGS &&... args) const
+  RAJA_HOST_DEVICE RAJA_INLINE value_type& operator()(ARGS&&... args) const
   {
     return base_.operator()(std::forward<ARGS>(args)...);
   }
@@ -289,8 +274,8 @@ struct AtomicViewWrapper<ViewType, RAJA::seq_atomic> {
 
 
 template <typename AtomicPolicy, typename ViewType>
-RAJA_INLINE AtomicViewWrapper<ViewType, AtomicPolicy> make_atomic_view(
-    ViewType const &view)
+RAJA_INLINE AtomicViewWrapper<ViewType, AtomicPolicy>
+make_atomic_view(ViewType const& view)
 {
 
   return RAJA::AtomicViewWrapper<ViewType, AtomicPolicy>(view);
diff --git a/include/RAJA/util/align.hpp b/include/RAJA/util/align.hpp
index 7103ecb152..23ccbee14c 100644
--- a/include/RAJA/util/align.hpp
+++ b/include/RAJA/util/align.hpp
@@ -32,16 +32,20 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space)
 {
 
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4146 )  // Force msvc to ignore subtracting from signed number warning
+#pragma warning(disable : 4146)  // Force msvc to ignore subtracting from signed
+                                 // number warning
 #endif
   void* r = nullptr;
-  if (size <= space) {
+  if (size <= space)
+  {
     char* p1 = static_cast<char*>(ptr);
     char* p2 = reinterpret_cast<char*>(
-        reinterpret_cast<size_t>(p1 + (static_cast<ptrdiff_t>(alignment) - 1)) & -alignment);
+        reinterpret_cast<size_t>(p1 + (static_cast<ptrdiff_t>(alignment) - 1)) &
+        -alignment);
     size_t d = static_cast<size_t>(p2 - p1);
-    if (d <= space - size) {
-      r = p2;
+    if (d <= space - size)
+    {
+      r   = p2;
       ptr = r;
       space -= d;
     }
@@ -49,9 +53,9 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space)
   return r;
 
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4146 )  // Force msvc to ignore subtracting from signed number warning
+#pragma warning(default : 4146)  // Force msvc to ignore subtracting from signed
+                                 // number warning
 #endif
-
 }
 
 }  // end namespace RAJA
diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp
index f0208ccbd3..3b488a81ec 100644
--- a/include/RAJA/util/basic_mempool.hpp
+++ b/include/RAJA/util/basic_mempool.hpp
@@ -54,27 +54,28 @@ namespace detail
 class MemoryArena
 {
 public:
-  using free_type = std::map<void*, void*>;
+  using free_type       = std::map<void*, void*>;
   using free_value_type = typename free_type::value_type;
-  using used_type = std::map<void*, void*>;
+  using used_type       = std::map<void*, void*>;
   using used_value_type = typename used_type::value_type;
 
   MemoryArena(void* ptr, size_t size)
-    : m_allocation{ ptr, static_cast<char*>(ptr)+size },
-      m_free_space(),
-      m_used_space()
+      : m_allocation {ptr, static_cast<char*>(ptr) + size},
+        m_free_space(),
+        m_used_space()
   {
-     m_free_space[ptr] = static_cast<char*>(ptr)+size ;
-    if (m_allocation.begin == nullptr) {
+    m_free_space[ptr] = static_cast<char*>(ptr) + size;
+    if (m_allocation.begin == nullptr)
+    {
       fprintf(stderr, "Attempt to create MemoryArena with no memory");
       std::abort();
     }
   }
 
-  MemoryArena(MemoryArena const&) = delete;
+  MemoryArena(MemoryArena const&)            = delete;
   MemoryArena& operator=(MemoryArena const&) = delete;
 
-  MemoryArena(MemoryArena&&) = default;
+  MemoryArena(MemoryArena&&)            = default;
   MemoryArena& operator=(MemoryArena&&) = default;
 
   size_t capacity()
@@ -90,21 +91,22 @@ class MemoryArena
   void* get(size_t nbytes, size_t alignment)
   {
     void* ptr_out = nullptr;
-    if (capacity() >= nbytes) {
+    if (capacity() >= nbytes)
+    {
       free_type::iterator end = m_free_space.end();
-      for (free_type::iterator iter = m_free_space.begin(); iter != end;
-           ++iter) {
+      for (free_type::iterator iter = m_free_space.begin(); iter != end; ++iter)
+      {
 
         void* adj_ptr = iter->first;
         size_t cap =
             static_cast<char*>(iter->second) - static_cast<char*>(adj_ptr);
 
-        if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) {
+        if (::RAJA::align(alignment, nbytes, adj_ptr, cap))
+        {
 
           ptr_out = adj_ptr;
 
-          remove_free_chunk(iter,
-                            adj_ptr,
+          remove_free_chunk(iter, adj_ptr,
                             static_cast<char*>(adj_ptr) + nbytes);
 
           add_used_chunk(adj_ptr, static_cast<char*>(adj_ptr) + nbytes);
@@ -118,29 +120,35 @@ class MemoryArena
 
   bool give(void* ptr)
   {
-    if (m_allocation.begin <= ptr && ptr < m_allocation.end) {
+    if (m_allocation.begin <= ptr && ptr < m_allocation.end)
+    {
 
       used_type::iterator found = m_used_space.find(ptr);
 
-      if (found != m_used_space.end()) {
+      if (found != m_used_space.end())
+      {
 
         add_free_chunk(found->first, found->second);
 
         m_used_space.erase(found);
-
-      } else {
+      }
+      else
+      {
         fprintf(stderr, "Invalid free %p", ptr);
         std::abort();
       }
 
       return true;
-    } else {
+    }
+    else
+    {
       return false;
     }
   }
 
 private:
-  struct memory_chunk {
+  struct memory_chunk
+  {
     void* begin;
     void* end;
   };
@@ -152,19 +160,23 @@ class MemoryArena
     free_type::iterator next = m_free_space.lower_bound(begin);
 
     // check if prev exists
-    if (next != m_free_space.begin()) {
+    if (next != m_free_space.begin())
+    {
       // check if prev can cover [begin, end)
       free_type::iterator prev = next;
       --prev;
-      if (prev->second == begin) {
+      if (prev->second == begin)
+      {
         // extend prev to cover [begin, end)
         prev->second = end;
 
         // check if prev can cover next too
-        if (next != invl) {
+        if (next != invl)
+        {
           assert(next->first != begin);
 
-          if (next->first == end) {
+          if (next->first == end)
+          {
             // extend prev to cover next too
             prev->second = next->second;
 
@@ -176,12 +188,14 @@ class MemoryArena
       }
     }
 
-    if (next != invl) {
+    if (next != invl)
+    {
       assert(next->first != begin);
 
-      if (next->first == end) {
+      if (next->first == end)
+      {
         // extend next to cover [begin, end)
-        m_free_space.insert(next, free_value_type{begin, next->second});
+        m_free_space.insert(next, free_value_type {begin, next->second});
         m_free_space.erase(next);
 
         return;
@@ -190,38 +204,42 @@ class MemoryArena
 
     // no free space adjacent to this chunk, add seperate free chunk [begin,
     // end)
-    m_free_space.insert(next, free_value_type{begin, end});
+    m_free_space.insert(next, free_value_type {begin, end});
   }
 
   void remove_free_chunk(free_type::iterator iter, void* begin, void* end)
   {
 
-    void* ptr = iter->first;
+    void* ptr     = iter->first;
     void* ptr_end = iter->second;
 
     // fixup m_free_space, shrinking and adding chunks as needed
-    if (ptr != begin) {
+    if (ptr != begin)
+    {
 
       // shrink end of current free region to [ptr, begin)
       iter->second = begin;
 
-      if (end != ptr_end) {
+      if (end != ptr_end)
+      {
 
         // insert free region [end, ptr_end) after current free region
         free_type::iterator next = iter;
         ++next;
-        m_free_space.insert(next, free_value_type{end, ptr_end});
+        m_free_space.insert(next, free_value_type {end, ptr_end});
       }
-
-    } else if (end != ptr_end) {
+    }
+    else if (end != ptr_end)
+    {
 
       // shrink beginning of current free region to [end, ptr_end)
       free_type::iterator next = iter;
       ++next;
-      m_free_space.insert(next, free_value_type{end, ptr_end});
+      m_free_space.insert(next, free_value_type {end, ptr_end});
       m_free_space.erase(iter);
-
-    } else {
+    }
+    else
+    {
 
       // can not reuse current region, erase
       m_free_space.erase(iter);
@@ -231,7 +249,7 @@ class MemoryArena
   void add_used_chunk(void* begin, void* end)
   {
     // simply inserts a chunk of memory into used_space
-    m_used_space.insert(used_value_type{begin, end});
+    m_used_space.insert(used_value_type {begin, end});
   }
 
   memory_chunk m_allocation;
@@ -290,7 +308,7 @@ class MemPool
 
   static inline MemPool<allocator_t>& getInstance()
   {
-    static MemPool<allocator_t> pool{};
+    static MemPool<allocator_t> pool {};
     return pool;
   }
 
@@ -298,8 +316,7 @@ class MemPool
 
   MemPool()
       : m_arenas(), m_default_arena_size(default_default_arena_size), m_alloc()
-  {
-  }
+  {}
 
   ~MemPool()
   {
@@ -316,7 +333,8 @@ class MemPool
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
 
-    while (!m_arenas.empty()) {
+    while (!m_arenas.empty())
+    {
       void* allocation_ptr = m_arenas.front().get_allocation();
       m_alloc.free(allocation_ptr);
       m_arenas.pop_front();
@@ -338,7 +356,7 @@ class MemPool
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
 
-    size_t prev_size = m_default_arena_size;
+    size_t prev_size     = m_default_arena_size;
     m_default_arena_size = new_size;
     return prev_size;
   }
@@ -350,22 +368,26 @@ class MemPool
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
 
-    const size_t size = nTs * sizeof(T);
-    void* ptr = nullptr;
+    const size_t size                  = nTs * sizeof(T);
+    void* ptr                          = nullptr;
     arena_container_type::iterator end = m_arenas.end();
     for (arena_container_type::iterator iter = m_arenas.begin(); iter != end;
-         ++iter) {
+         ++iter)
+    {
       ptr = iter->get(size, alignment);
-      if (ptr != nullptr) {
+      if (ptr != nullptr)
+      {
         break;
       }
     }
 
-    if (ptr == nullptr) {
+    if (ptr == nullptr)
+    {
       const size_t alloc_size =
           std::max(size + alignment, m_default_arena_size);
       void* arena_ptr = m_alloc.malloc(alloc_size);
-      if (arena_ptr != nullptr) {
+      if (arena_ptr != nullptr)
+      {
         m_arenas.emplace_front(arena_ptr, alloc_size);
         ptr = m_arenas.front().get(size, alignment);
       }
@@ -380,16 +402,19 @@ class MemPool
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
 
-    void* ptr = const_cast<void*>(cptr);
+    void* ptr                          = const_cast<void*>(cptr);
     arena_container_type::iterator end = m_arenas.end();
     for (arena_container_type::iterator iter = m_arenas.begin(); iter != end;
-         ++iter) {
-      if (iter->give(ptr)) {
+         ++iter)
+    {
+      if (iter->give(ptr))
+      {
         ptr = nullptr;
         break;
       }
     }
-    if (ptr != nullptr) {
+    if (ptr != nullptr)
+    {
       fprintf(stderr, "Unknown pointer %p", ptr);
     }
   }
@@ -407,7 +432,8 @@ class MemPool
 };
 
 //! example allocator for basic_mempool using malloc/free
-struct generic_allocator {
+struct generic_allocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes) { return std::malloc(nbytes); }
diff --git a/include/RAJA/util/concepts.hpp b/include/RAJA/util/concepts.hpp
index 4372993949..06637e7a96 100644
--- a/include/RAJA/util/concepts.hpp
+++ b/include/RAJA/util/concepts.hpp
@@ -34,17 +34,17 @@ using namespace camp::concepts;
 
 template <typename From, typename To>
 struct ConvertibleTo
-  : DefineConcept(::RAJA::concepts::convertible_to<To>(camp::val<From>())) {
-};
+    : DefineConcept(::RAJA::concepts::convertible_to<To>(camp::val<From>()))
+{};
 
-}
+}  // namespace concepts
 
 namespace type_traits
 {
 using namespace camp::type_traits;
 
 DefineTypeTraitFromConcept(convertible_to, concepts::ConvertibleTo);
-}
+}  // namespace type_traits
 
 }  // end namespace RAJA
 
diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp
index 25783b2a0a..81069d57d0 100644
--- a/include/RAJA/util/for_each.hpp
+++ b/include/RAJA/util/for_each.hpp
@@ -38,11 +38,13 @@ namespace detail
 
 // runtime loop applying func to each element in the range in order
 RAJA_SUPPRESS_HD_WARN
-template<typename Iter, typename UnaryFunc>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func)
+template <typename Iter, typename UnaryFunc>
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each(Iter begin,
+                                                Iter end,
+                                                UnaryFunc func)
 {
-  for (; begin != end; ++begin) {
+  for (; begin != end; ++begin)
+  {
     func(*begin);
   }
 
@@ -52,11 +54,11 @@ UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func)
 // compile time expansion applying func to a each type in the list in order
 RAJA_SUPPRESS_HD_WARN
 template <typename UnaryFunc, typename... Ts>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_type(camp::list<Ts...> const&, UnaryFunc func)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list<Ts...> const&,
+                                                     UnaryFunc func)
 {
   // braced init lists are evaluated in order
-  int seq_unused_array[] = {0, (func(Ts{}), 0)...};
+  int seq_unused_array[] = {0, (func(Ts {}), 0)...};
   RAJA_UNUSED_VAR(seq_unused_array);
 
   return func;
@@ -65,8 +67,9 @@ UnaryFunc for_each_type(camp::list<Ts...> const&, UnaryFunc func)
 // compile time expansion applying func to a each type in the tuple in order
 RAJA_SUPPRESS_HD_WARN
 template <typename Tuple, typename UnaryFunc, camp::idx_t... Is>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq<Is...>)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t,
+                                                      UnaryFunc func,
+                                                      camp::idx_seq<Is...>)
 {
   using camp::get;
   // braced init lists are evaluated in order
@@ -87,7 +90,7 @@ UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq<Is...>)
 RAJA_SUPPRESS_HD_WARN
 template <typename Container, typename UnaryFunc>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
+    concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
     for_each(Container&& c, UnaryFunc func)
 {
   using std::begin;
@@ -102,23 +105,23 @@ concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
 */
 RAJA_SUPPRESS_HD_WARN
 template <typename UnaryFunc, typename... Ts>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_type(camp::list<Ts...> const& c, UnaryFunc func)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list<Ts...> const& c,
+                                                     UnaryFunc func)
 {
   return detail::for_each_type(c, std::move(func));
 }
 
 /*!
-  \brief Apply func to each object in the given tuple or tuple like type in order
-  using a compile-time expansion in O(N) operations and O(1) extra memory
+  \brief Apply func to each object in the given tuple or tuple like type in
+  order using a compile-time expansion in O(N) operations and O(1) extra memory
 */
 RAJA_SUPPRESS_HD_WARN
 template <typename Tuple, typename UnaryFunc>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func)
 {
-  return detail::for_each_tuple(std::forward<Tuple>(t), std::move(func),
-      camp::make_idx_seq_t<std::tuple_size<camp::decay<Tuple>>::value>{});
+  return detail::for_each_tuple(
+      std::forward<Tuple>(t), std::move(func),
+      camp::make_idx_seq_t<std::tuple_size<camp::decay<Tuple>>::value> {});
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp
index 9ddb5bebb7..dddb050ec4 100644
--- a/include/RAJA/util/macros.hpp
+++ b/include/RAJA/util/macros.hpp
@@ -33,16 +33,16 @@
 // We need a better solution than this as it is a pain to manage
 // this stuff in an application.
 //
-#if (defined(RAJA_ENABLE_CUDA) && defined(__CUDA_ARCH__)) \
-  || (defined(RAJA_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) \
-  || (defined(RAJA_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__))
+#if (defined(RAJA_ENABLE_CUDA) && defined(__CUDA_ARCH__)) ||                   \
+    (defined(RAJA_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) ||           \
+    (defined(RAJA_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__))
 #define RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE
 #endif
 
 #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
 #define RAJA_HOST_DEVICE __host__ __device__
-#define RAJA_DEVICE __device__
-#define RAJA_HOST __host__
+#define RAJA_DEVICE      __device__
+#define RAJA_HOST        __host__
 
 #if defined(RAJA_ENABLE_CLANG_CUDA)
 #define RAJA_SUPPRESS_HD_WARN
@@ -52,8 +52,8 @@
 
 #elif defined(RAJA_ENABLE_HIP) && defined(__HIPCC__)
 #define RAJA_HOST_DEVICE __host__ __device__
-#define RAJA_DEVICE __device__
-#define RAJA_HOST __host__
+#define RAJA_DEVICE      __device__
+#define RAJA_HOST        __host__
 #define RAJA_SUPPRESS_HD_WARN
 
 #define RAJA_USE_HIP_INTRINSICS
@@ -115,9 +115,8 @@
  *******************************************************************************
  */
 template <typename... T>
-RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
-{
-}
+RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T&&...) noexcept
+{}
 
 /*!
  * \def RAJA_STRINGIFY_HELPER(x)
@@ -133,7 +132,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
  */
 #define RAJA_STRINGIFY_MACRO(x) RAJA_STRINGIFY_HELPER(x)
 
-#define RAJA_DIVIDE_CEILING_INT(dividend, divisor) \
+#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)                             \
   (((dividend) + (divisor)-1) / (divisor))
 
 /*!
@@ -141,27 +140,26 @@ RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
  * Used in forall and launch
  */
 #if defined(RAJA_ENABLE_OPENMP)
-#define RAJA_OMP_DECLARE_REDUCTION_COMBINE \
-      _Pragma(" omp declare reduction( combine \
+#define RAJA_OMP_DECLARE_REDUCTION_COMBINE                                     \
+  _Pragma(" omp declare reduction( combine \
         : typename std::remove_reference<decltype(f_params)>::type \
-        : RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(omp_out, omp_in) ) ")\
-        //initializer(omp_priv = omp_in) ")
+        : RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(omp_out, omp_in) ) ")  // initializer(omp_priv = omp_in) ")
 #endif
 
 
 RAJA_HOST_DEVICE
-inline void RAJA_ABORT_OR_THROW(const char *str)
+inline void RAJA_ABORT_OR_THROW(const char* str)
 {
 #if defined(__SYCL_DEVICE_ONLY__)
-  //segfault here ran into linking problems
-  *((volatile char *)0) = 0;  // write to address 0
+  // segfault here ran into linking problems
+  *((volatile char*)0) = 0;  // write to address 0
 #else
-  printf ( "%s\n", str );
+  printf("%s\n", str);
 #if defined(RAJA_ENABLE_TARGET_OPENMP) && (_OPENMP >= 201511)
   // seg faulting here instead of calling std::abort for omp target
-  *((volatile char *)0) = 0;  // write to address 0
+  *((volatile char*)0) = 0;  // write to address 0
 #elif defined(__CUDA_ARCH__)
-  asm ("trap;");
+  asm("trap;");
 
 #elif defined(__HIP_DEVICE_COMPILE__)
   abort();
@@ -169,10 +167,11 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
 #else
 #ifdef RAJA_COMPILER_MSVC
   fflush(stdout);
-  char *value;
+  char* value;
   size_t len;
   bool no_except = false;
-  if(_dupenv_s(&value, &len, "RAJA_NO_EXCEPT") == 0 && value != nullptr){
+  if (_dupenv_s(&value, &len, "RAJA_NO_EXCEPT") == 0 && value != nullptr)
+  {
     no_except = true;
     free(value);
   }
@@ -182,9 +181,12 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
 #endif
 
   fflush(stdout);
-  if (no_except) {
+  if (no_except)
+  {
     std::abort();
-  } else {
+  }
+  else
+  {
     throw std::runtime_error(str);
   }
 #endif
@@ -202,7 +204,7 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
  */
 
 #if (__cplusplus >= 201402L)
-#define RAJA_HAS_CXX14 1
+#define RAJA_HAS_CXX14                    1
 #define RAJA_HAS_CXX_ATTRIBUTE_DEPRECATED 1
 #elif defined(__has_cpp_attribute)
 #if __has_cpp_attribute(deprecated)
@@ -212,7 +214,7 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
 
 #if defined(RAJA_HAS_CXX_ATTRIBUTE_DEPRECATED)
 // When using a C++14 compiler, use the standard-specified deprecated attribute
-#define RAJA_DEPRECATE(Msg) [[deprecated(Msg)]]
+#define RAJA_DEPRECATE(Msg)       [[deprecated(Msg)]]
 #define RAJA_DEPRECATE_ALIAS(Msg) [[deprecated(Msg)]]
 
 #elif defined(_MSC_VER)
diff --git a/include/RAJA/util/math.hpp b/include/RAJA/util/math.hpp
index 99d7bc192e..cdc03b9db7 100644
--- a/include/RAJA/util/math.hpp
+++ b/include/RAJA/util/math.hpp
@@ -34,14 +34,14 @@ namespace RAJA
     For zero or negative n return 0
 
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T log2(T n) noexcept
+template <typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T log2(T n) noexcept
 {
   T result = 0;
-  if (n > 0) {
-    while(n >>= 1) {
+  if (n > 0)
+  {
+    while (n >>= 1)
+    {
       ++result;
     }
   }
@@ -57,13 +57,12 @@ constexpr T log2(T n) noexcept
         if n is not a power of 2, return the next greater power of 2
       if n is negative, return 0
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T next_pow2(T n) noexcept
+template <typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T next_pow2(T n) noexcept
 {
   --n;
-  for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) {
+  for (size_t s = 1; s < CHAR_BIT * sizeof(T); s *= 2)
+  {
     n |= n >> s;
   }
   ++n;
@@ -71,7 +70,8 @@ constexpr T next_pow2(T n) noexcept
 }
 
 /*!
-    \brief "round down" to the largest power of 2 that is less than or equal to n
+    \brief "round down" to the largest power of 2 that is less than or equal to
+   n
 
     For an integer n,
       if n is negative, return 0
@@ -79,13 +79,12 @@ constexpr T next_pow2(T n) noexcept
         if n is a power of 2, return n
         else return the largest power of 2 that is less than n
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T prev_pow2(T n) noexcept
+template <typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T prev_pow2(T n) noexcept
 {
-  if ( n < 0 ) return 0;
-  for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) {
+  if (n < 0) return 0;
+  for (size_t s = 1; s < CHAR_BIT * sizeof(T); s *= 2)
+  {
     n |= n >> s;
   }
   return n - (n >> 1);
@@ -94,12 +93,14 @@ constexpr T prev_pow2(T n) noexcept
 /*!
     \brief compute lhs mod rhs where lhs is non-negative and rhs is a power of 2
 */
-template < typename L, typename R,
-           std::enable_if_t<std::is_integral<L>::value && std::is_integral<R>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr auto power_of_2_mod(L lhs, R rhs) noexcept
+template <typename L,
+          typename R,
+          std::enable_if_t<std::is_integral<L>::value &&
+                           std::is_integral<R>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto power_of_2_mod(L lhs,
+                                                           R rhs) noexcept
 {
-  return lhs & (rhs-R(1));
+  return lhs & (rhs - R(1));
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/mutex.hpp b/include/RAJA/util/mutex.hpp
index a955b27915..631177cbf6 100644
--- a/include/RAJA/util/mutex.hpp
+++ b/include/RAJA/util/mutex.hpp
@@ -39,10 +39,10 @@ class mutex
 
   mutex() { omp_init_lock(&m_lock); }
 
-  mutex(const mutex&) = delete;
-  mutex(mutex&&) = delete;
+  mutex(const mutex&)            = delete;
+  mutex(mutex&&)                 = delete;
   mutex& operator=(const mutex&) = delete;
-  mutex& operator=(mutex&&) = delete;
+  mutex& operator=(mutex&&)      = delete;
 
   void lock() { omp_set_lock(&m_lock); }
 
@@ -68,10 +68,10 @@ class lock_guard
 public:
   explicit lock_guard(mutex_type& m) : m_mutex(m) { m_mutex.lock(); }
 
-  lock_guard(const lock_guard&) = delete;
-  lock_guard(lock_guard&&) = delete;
+  lock_guard(const lock_guard&)            = delete;
+  lock_guard(lock_guard&&)                 = delete;
   lock_guard& operator=(const lock_guard&) = delete;
-  lock_guard& operator=(lock_guard&&) = delete;
+  lock_guard& operator=(lock_guard&&)      = delete;
 
   ~lock_guard() { m_mutex.unlock(); }
 
diff --git a/include/RAJA/util/plugins.hpp b/include/RAJA/util/plugins.hpp
index d5f42efde0..301bbc875c 100644
--- a/include/RAJA/util/plugins.hpp
+++ b/include/RAJA/util/plugins.hpp
@@ -18,103 +18,88 @@
 #include "RAJA/util/KokkosPluginLoader.hpp"
 #endif
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 template <typename T>
-RAJA_INLINE auto trigger_updates_before(T&& item)
-  -> typename std::remove_reference<T>::type
+RAJA_INLINE auto trigger_updates_before(T&& item) ->
+    typename std::remove_reference<T>::type
 {
   return item;
 }
 
 RAJA_INLINE
-void
-callPreCapturePlugins(const PluginContext& p)
+void callPreCapturePlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->preCapture(p);
   }
 }
 
 RAJA_INLINE
-void
-callPostCapturePlugins(const PluginContext& p)
+void callPostCapturePlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->postCapture(p);
   }
 }
 
 RAJA_INLINE
-void
-callPreLaunchPlugins(const PluginContext& p)
+void callPreLaunchPlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->preLaunch(p);
   }
 }
 
 RAJA_INLINE
-void
-callPostLaunchPlugins(const PluginContext& p)
+void callPostLaunchPlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->postLaunch(p);
   }
 }
 
 RAJA_INLINE
-void
-callInitPlugins(const PluginOptions p)
+void callInitPlugins(const PluginOptions p)
 {
-  for (auto plugin = PluginRegistry::begin(); 
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->init(p);
   }
 }
 
 RAJA_INLINE
-void
-init_plugins(const std::string& path)
-{   
+void init_plugins(const std::string& path)
+{
   callInitPlugins(make_options(path));
 }
 
 RAJA_INLINE
-void
-init_plugins()
-{   
-  callInitPlugins(make_options(""));
-}
+void init_plugins() { callInitPlugins(make_options("")); }
 
 RAJA_INLINE
-void
-finalize_plugins()
-{   
-  for (auto plugin = PluginRegistry::begin(); 
-    plugin != PluginRegistry::end();
-    ++plugin)
+void finalize_plugins()
+{
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->finalize();
   }
 }
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/reduce.hpp b/include/RAJA/util/reduce.hpp
index 6d0c28f861..de25c2005a 100644
--- a/include/RAJA/util/reduce.hpp
+++ b/include/RAJA/util/reduce.hpp
@@ -44,19 +44,16 @@ namespace detail
 template <typename T, typename BinaryOp>
 struct LeftFoldReduce
 {
-  RAJA_HOST_DEVICE RAJA_INLINE
-  constexpr explicit LeftFoldReduce(T init = BinaryOp::identity(),
-                                      BinaryOp op = BinaryOp{}) noexcept
-    : m_op(std::move(op))
-    , m_accumulated_value(std::move(init))
-  {
-
-  }
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit LeftFoldReduce(
+      T init      = BinaryOp::identity(),
+      BinaryOp op = BinaryOp {}) noexcept
+      : m_op(std::move(op)), m_accumulated_value(std::move(init))
+  {}
 
-  LeftFoldReduce(LeftFoldReduce const&) = delete;
+  LeftFoldReduce(LeftFoldReduce const&)            = delete;
   LeftFoldReduce& operator=(LeftFoldReduce const&) = delete;
-  LeftFoldReduce(LeftFoldReduce &&) = delete;
-  LeftFoldReduce& operator=(LeftFoldReduce &&) = delete;
+  LeftFoldReduce(LeftFoldReduce&&)                 = delete;
+  LeftFoldReduce& operator=(LeftFoldReduce&&)      = delete;
 
   ~LeftFoldReduce() = default;
 
@@ -64,8 +61,7 @@ struct LeftFoldReduce
   /*!
       \brief reset the combined value of the reducer to the identity
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void clear() noexcept
+  RAJA_HOST_DEVICE RAJA_INLINE void clear() noexcept
   {
     m_accumulated_value = BinaryOp::identity();
   }
@@ -73,8 +69,7 @@ struct LeftFoldReduce
   /*!
       \brief return the combined value and clear the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get_and_clear()
+  RAJA_HOST_DEVICE RAJA_INLINE T get_and_clear()
   {
     T accumulated_value = std::move(m_accumulated_value);
 
@@ -86,17 +81,12 @@ struct LeftFoldReduce
   /*!
       \brief return the combined value
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get()
-  {
-    return m_accumulated_value;
-  }
+  RAJA_HOST_DEVICE RAJA_INLINE T get() { return m_accumulated_value; }
 
   /*!
       \brief combine a value into the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void combine(T val)
+  RAJA_HOST_DEVICE RAJA_INLINE void combine(T val)
   {
     m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val));
   }
@@ -109,50 +99,50 @@ struct LeftFoldReduce
 /*!
     \brief Reduce class that does a reduction with a binary tree.
 */
-template <typename T, typename BinaryOp, typename SizeType = size_t,
-          SizeType t_num_levels = CHAR_BIT*sizeof(SizeType)>
+template <typename T,
+          typename BinaryOp,
+          typename SizeType     = size_t,
+          SizeType t_num_levels = CHAR_BIT * sizeof(SizeType)>
 struct BinaryTreeReduce
 {
   static_assert(std::is_unsigned<SizeType>::value, "SizeType must be unsigned");
-  static_assert(t_num_levels <= CHAR_BIT*sizeof(SizeType), "SizeType must be large enough to act at a bitset for num_levels");
+  static_assert(
+      t_num_levels <= CHAR_BIT * sizeof(SizeType),
+      "SizeType must be large enough to act at a bitset for num_levels");
 
   static constexpr SizeType num_levels = t_num_levels;
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  constexpr explicit BinaryTreeReduce(T init = BinaryOp::identity(),
-                                      BinaryOp op = BinaryOp{}) noexcept
-    : m_op(std::move(op))
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit BinaryTreeReduce(
+      T init      = BinaryOp::identity(),
+      BinaryOp op = BinaryOp {}) noexcept
+      : m_op(std::move(op))
   {
     combine(std::move(init));
   }
 
-  BinaryTreeReduce(BinaryTreeReduce const&) = delete;
+  BinaryTreeReduce(BinaryTreeReduce const&)            = delete;
   BinaryTreeReduce& operator=(BinaryTreeReduce const&) = delete;
-  BinaryTreeReduce(BinaryTreeReduce &&) = delete;
-  BinaryTreeReduce& operator=(BinaryTreeReduce &&) = delete;
+  BinaryTreeReduce(BinaryTreeReduce&&)                 = delete;
+  BinaryTreeReduce& operator=(BinaryTreeReduce&&)      = delete;
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  ~BinaryTreeReduce()
-  {
-    clear();
-  }
+  RAJA_HOST_DEVICE RAJA_INLINE ~BinaryTreeReduce() { clear(); }
 
 
   /*!
       \brief reset the combined value of the reducer to the identity
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void clear() noexcept
+  RAJA_HOST_DEVICE RAJA_INLINE void clear() noexcept
   {
     // destroy all values on the tree stack and reset count to 0
-    for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) {
+    for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1)
+    {
 
-      if (m_count & mask) {
+      if (m_count & mask)
+      {
 
         get_value(level)->~T();
 
         m_count ^= mask;
-
       }
     }
   }
@@ -160,15 +150,16 @@ struct BinaryTreeReduce
   /*!
       \brief return the combined value and clear the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get_and_clear()
+  RAJA_HOST_DEVICE RAJA_INLINE T get_and_clear()
   {
     // accumulate all values
     T value = BinaryOp::identity();
 
-    for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) {
+    for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1)
+    {
 
-      if (m_count & mask) {
+      if (m_count & mask)
+      {
 
         value = m_op(std::move(value), std::move(*get_value(level)));
         get_value(level)->~T();
@@ -183,15 +174,17 @@ struct BinaryTreeReduce
   /*!
       \brief return the combined value
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get()
+  RAJA_HOST_DEVICE RAJA_INLINE T get()
   {
     // accumulate all values
     T value = BinaryOp::identity();
 
-    for (SizeType count = m_count, level = 0, mask = 1; count; ++level, mask <<= 1) {
+    for (SizeType count = m_count, level = 0, mask = 1; count;
+         ++level, mask <<= 1)
+    {
 
-      if (count & mask) {
+      if (count & mask)
+      {
 
         value = m_op(std::move(value), *get_value(level));
 
@@ -205,20 +198,19 @@ struct BinaryTreeReduce
   /*!
       \brief combine a value into the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void combine(T value)
+  RAJA_HOST_DEVICE RAJA_INLINE void combine(T value)
   {
     // accumulate values and store in the first unused level found
     // clear values from used levels along the way
     SizeType level = 0;
-    for (SizeType mask = 1; m_count & mask; ++level, mask <<= 1) {
+    for (SizeType mask = 1; m_count & mask; ++level, mask <<= 1)
+    {
 
       value = m_op(std::move(*get_value(level)), std::move(value));
       get_value(level)->~T();
-
     }
 
-    new(get_storage(level)) T(std::move(value));
+    new (get_storage(level)) T(std::move(value));
 
     ++m_count;
   }
@@ -234,14 +226,12 @@ struct BinaryTreeReduce
   // values or is unused and has no value.
   std::aligned_storage_t<sizeof(T), alignof(T)> m_tree_stack[num_levels];
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void* get_storage(SizeType level)
+  RAJA_HOST_DEVICE RAJA_INLINE void* get_storage(SizeType level)
   {
     return &m_tree_stack[level];
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T* get_value(SizeType level)
+  RAJA_HOST_DEVICE RAJA_INLINE T* get_value(SizeType level)
   {
 #if __cplusplus >= 201703L && !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
     // TODO: check that launder is supported in device code
@@ -254,10 +244,10 @@ struct BinaryTreeReduce
 
 
 template <typename T, typename BinaryOp>
-using HighAccuracyReduce = std::conditional_t<
-    RAJA::operators::is_fp_associative<T>::value,
-      BinaryTreeReduce<T, BinaryOp>,
-      LeftFoldReduce<T, BinaryOp>>;
+using HighAccuracyReduce =
+    std::conditional_t<RAJA::operators::is_fp_associative<T>::value,
+                       BinaryTreeReduce<T, BinaryOp>,
+                       LeftFoldReduce<T, BinaryOp>>;
 
 
 /*!
@@ -265,18 +255,15 @@ using HighAccuracyReduce = std::conditional_t<
            operation using O(N) operations and O(1) memory
 */
 template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T left_fold_reduce(Iter begin,
-                   Iter end,
-                   T init,
-                   BinaryOp op)
+RAJA_HOST_DEVICE RAJA_INLINE T
+left_fold_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   LeftFoldReduce<T, BinaryOp> reducer(std::move(init), std::move(op));
 
-  for (; begin != end; ++begin) {
+  for (; begin != end; ++begin)
+  {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
@@ -290,20 +277,18 @@ T left_fold_reduce(Iter begin,
     floating point types.
 */
 template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T binary_tree_reduce(Iter begin,
-                     Iter end,
-                     T init,
-                     BinaryOp op)
+RAJA_HOST_DEVICE RAJA_INLINE T
+binary_tree_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   using std::distance;
   using SizeType = std::make_unsigned_t<decltype(distance(begin, end))>;
-  BinaryTreeReduce<T, BinaryOp, SizeType> reducer(std::move(init), std::move(op));
+  BinaryTreeReduce<T, BinaryOp, SizeType> reducer(std::move(init),
+                                                  std::move(op));
 
-  for (; begin != end; ++begin) {
+  for (; begin != end; ++begin)
+  {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
@@ -315,18 +300,15 @@ T binary_tree_reduce(Iter begin,
     is a concern, or a faster algorithm with it is not a concern
 */
 template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T high_accuracy_reduce(Iter begin,
-                        Iter end,
-                        T init,
-                        BinaryOp op)
+RAJA_HOST_DEVICE RAJA_INLINE T
+high_accuracy_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   HighAccuracyReduce<T, BinaryOp> reducer(std::move(init), std::move(op));
 
-  for (; begin != end; ++begin) {
+  for (; begin != end; ++begin)
+  {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
@@ -340,18 +322,21 @@ T high_accuracy_reduce(Iter begin,
     see https://en.cppreference.com/w/cpp/algorithm/accumulate
 */
 template <typename Container,
-          typename T = detail::ContainerVal<Container>,
+          typename T        = detail::ContainerVal<Container>,
           typename BinaryOp = operators::plus<T>>
-RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    accumulate(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+RAJA_HOST_DEVICE
+    RAJA_INLINE concepts::enable_if_t<T, type_traits::is_range<Container>>
+    accumulate(Container&& c,
+               T init      = BinaryOp::identity(),
+               BinaryOp op = BinaryOp {})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::left_fold_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::left_fold_reduce(begin(c), end(c), std::move(init),
+                                  std::move(op));
 }
 
 /*!
@@ -360,18 +345,21 @@ concepts::enable_if_t<T, type_traits::is_range<Container>>
     see https://en.cppreference.com/w/cpp/algorithm/reduce
 */
 template <typename Container,
-          typename T = detail::ContainerVal<Container>,
+          typename T        = detail::ContainerVal<Container>,
           typename BinaryOp = operators::plus<T>>
-RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    binary_tree_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+RAJA_HOST_DEVICE
+    RAJA_INLINE concepts::enable_if_t<T, type_traits::is_range<Container>>
+    binary_tree_reduce(Container&& c,
+                       T init      = BinaryOp::identity(),
+                       BinaryOp op = BinaryOp {})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::binary_tree_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::binary_tree_reduce(begin(c), end(c), std::move(init),
+                                    std::move(op));
 }
 
 /*!
@@ -381,18 +369,21 @@ concepts::enable_if_t<T, type_traits::is_range<Container>>
     see https://en.cppreference.com/w/cpp/algorithm/reduce
 */
 template <typename Container,
-          typename T = detail::ContainerVal<Container>,
+          typename T        = detail::ContainerVal<Container>,
           typename BinaryOp = operators::plus<T>>
-RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    high_accuracy_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+RAJA_HOST_DEVICE
+    RAJA_INLINE concepts::enable_if_t<T, type_traits::is_range<Container>>
+    high_accuracy_reduce(Container&& c,
+                         T init      = BinaryOp::identity(),
+                         BinaryOp op = BinaryOp {})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::high_accuracy_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::high_accuracy_reduce(begin(c), end(c), std::move(init),
+                                      std::move(op));
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp
index 28a476d951..567d95e21e 100644
--- a/include/RAJA/util/resource.hpp
+++ b/include/RAJA/util/resource.hpp
@@ -37,145 +37,212 @@
 namespace RAJA
 {
 
-  namespace resources
-  {
-  using namespace camp::resources;
+namespace resources
+{
+using namespace camp::resources;
 
-  template<typename e>
-  struct get_resource{
-    using type = camp::resources::Host;
-  };
+template <typename e>
+struct get_resource
+{
+  using type = camp::resources::Host;
+};
 
-  template<Platform>
-  struct get_resource_from_platform{
-    using type = camp::resources::Host;
-  };
+template <Platform>
+struct get_resource_from_platform
+{
+  using type = camp::resources::Host;
+};
 
-  template<typename ExecPol>
-  using resource_from_pol_t = typename get_resource_from_platform<detail::get_platform<ExecPol>::value>::type;
+template <typename ExecPol>
+using resource_from_pol_t = typename get_resource_from_platform<
+    detail::get_platform<ExecPol>::value>::type;
 
-  template<typename ExecPol>
-  constexpr resource_from_pol_t<ExecPol> get_default_resource() {
-    return resource_from_pol_t<ExecPol>::get_default();
-  }
+template <typename ExecPol>
+constexpr resource_from_pol_t<ExecPol> get_default_resource()
+{
+  return resource_from_pol_t<ExecPol>::get_default();
+}
 
 #if defined(RAJA_CUDA_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::cuda>{
-    using type = camp::resources::Cuda;
-  };
-
-  template<typename IterationMapping, typename IterationGetter,
-           typename Concretizer, size_t BLOCKS_PER_SM, bool Async>
-  struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>>{
-    using type = camp::resources::Cuda;
-  };
-
-  template <bool Async, int num_threads, size_t BLOCKS_PER_SM>
-  struct get_resource<::RAJA::policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>>{
-    using type = camp::resources::Cuda;
-  };
-
-  template<typename ISetIter, typename IterationMapping, typename IterationGetter,
-           typename Concretizer, size_t BLOCKS_PER_SM, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>>>{
-    using type = camp::resources::Cuda;
-  };
+template <>
+struct get_resource_from_platform<Platform::cuda>
+{
+  using type = camp::resources::Cuda;
+};
+
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async>
+struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                             IterationGetter,
+                                                             Concretizer,
+                                                             BLOCKS_PER_SM,
+                                                             Async>>
+{
+  using type = camp::resources::Cuda;
+};
+
+template <bool Async, int num_threads, size_t BLOCKS_PER_SM>
+struct get_resource<::RAJA::policy::cuda::cuda_launch_explicit_t<Async,
+                                                                 num_threads,
+                                                                 BLOCKS_PER_SM>>
+{
+  using type = camp::resources::Cuda;
+};
+
+template <typename ISetIter,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          size_t BLOCKS_PER_SM,
+          bool Async>
+struct get_resource<
+    ExecPolicy<ISetIter,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async>>>
+{
+  using type = camp::resources::Cuda;
+};
 #endif
 
 #if defined(RAJA_HIP_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::hip>{
-    using type = camp::resources::Hip;
-  };
-
-  template<typename IterationMapping, typename IterationGetter,
-           typename Concretizer, bool Async>
-  struct get_resource<::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>{
-    using type = camp::resources::Hip;
-  };
-
-  template <bool Async, int num_threads>
-  struct get_resource<::RAJA::policy::hip::hip_launch_t<Async, num_threads>>{
-    using type = camp::resources::Hip;
-  };
-
-  template<typename ISetIter, typename IterationMapping, typename IterationGetter,
-           typename Concretizer, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>>{
-    using type = camp::resources::Hip;
-  };
+template <>
+struct get_resource_from_platform<Platform::hip>
+{
+  using type = camp::resources::Hip;
+};
+
+template <typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async>
+struct get_resource<
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>
+{
+  using type = camp::resources::Hip;
+};
+
+template <bool Async, int num_threads>
+struct get_resource<::RAJA::policy::hip::hip_launch_t<Async, num_threads>>
+{
+  using type = camp::resources::Hip;
+};
+
+template <typename ISetIter,
+          typename IterationMapping,
+          typename IterationGetter,
+          typename Concretizer,
+          bool Async>
+struct get_resource<ExecPolicy<
+    ISetIter,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>>
+{
+  using type = camp::resources::Hip;
+};
 #endif
 
 #if defined(RAJA_SYCL_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::sycl>{
-    using type = camp::resources::Sycl;
-  };
-
-  template<size_t BlockSize, bool Async>
-  struct get_resource<::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>{
-    using type = camp::resources::Sycl;
-  };
-
-  template <bool Async, int num_threads>
-  struct get_resource<::RAJA::policy::sycl::sycl_launch_t<Async, num_threads>>{
-    using type = camp::resources::Sycl;
-  };
-
-  template<typename ISetIter, size_t BlockSize, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>>{
-    using type = camp::resources::Sycl;
-  };
+template <>
+struct get_resource_from_platform<Platform::sycl>
+{
+  using type = camp::resources::Sycl;
+};
+
+template <size_t BlockSize, bool Async>
+struct get_resource<::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>
+{
+  using type = camp::resources::Sycl;
+};
+
+template <bool Async, int num_threads>
+struct get_resource<::RAJA::policy::sycl::sycl_launch_t<Async, num_threads>>
+{
+  using type = camp::resources::Sycl;
+};
+
+template <typename ISetIter, size_t BlockSize, bool Async>
+struct get_resource<
+    ExecPolicy<ISetIter, ::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>>
+{
+  using type = camp::resources::Sycl;
+};
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-  template<>
-  struct get_resource_from_platform<Platform::omp_target>{
-    using type = camp::resources::Omp;
-  };
-
-  template<>
-  struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec_nt>{
-    using type = camp::resources::Omp;
-  };
-
-  template<size_t ThreadsPerTeam>
-  struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>{
-    using type = camp::resources::Omp;
-  };
-
-  template<typename ISetIter>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::omp::omp_target_parallel_for_exec_nt>>{
-    using type = camp::resources::Omp;
-  };
-
-  template<typename ISetIter, size_t ThreadsPerTeam>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>>{
-    using type = camp::resources::Omp;
-  };
+template <>
+struct get_resource_from_platform<Platform::omp_target>
+{
+  using type = camp::resources::Omp;
+};
+
+template <>
+struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec_nt>
+{
+  using type = camp::resources::Omp;
+};
+
+template <size_t ThreadsPerTeam>
+struct get_resource<
+    ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>
+{
+  using type = camp::resources::Omp;
+};
+
+template <typename ISetIter>
+struct get_resource<
+    ExecPolicy<ISetIter, ::RAJA::policy::omp::omp_target_parallel_for_exec_nt>>
+{
+  using type = camp::resources::Omp;
+};
+
+template <typename ISetIter, size_t ThreadsPerTeam>
+struct get_resource<ExecPolicy<
+    ISetIter,
+    ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>>
+{
+  using type = camp::resources::Omp;
+};
 #endif
 
-  } // end namespace resources
+}  // end namespace resources
 
-  namespace type_traits
-  {
-    template <typename T> struct is_resource : std::false_type {};
-    template <> struct is_resource<resources::Host> : std::true_type {};
+namespace type_traits
+{
+template <typename T>
+struct is_resource : std::false_type
+{};
+template <>
+struct is_resource<resources::Host> : std::true_type
+{};
 #if defined(RAJA_CUDA_ACTIVE)
-    template <> struct is_resource<resources::Cuda> : std::true_type {};
+template <>
+struct is_resource<resources::Cuda> : std::true_type
+{};
 #endif
 #if defined(RAJA_HIP_ACTIVE)
-    template <> struct is_resource<resources::Hip> : std::true_type {};
+template <>
+struct is_resource<resources::Hip> : std::true_type
+{};
 #endif
 #if defined(RAJA_SYCL_ACTIVE)
-    template <> struct is_resource<resources::Sycl> : std::true_type {};
+template <>
+struct is_resource<resources::Sycl> : std::true_type
+{};
 #endif
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-    template <> struct is_resource<resources::Omp> : std::true_type {};
+template <>
+struct is_resource<resources::Omp> : std::true_type
+{};
 #endif
-  } // end namespace type_traits
+}  // end namespace type_traits
 
 }  // end namespace RAJA
 
-#endif //RAJA_resources_HPP#
+#endif  // RAJA_resources_HPP
diff --git a/include/RAJA/util/sort.hpp b/include/RAJA/util/sort.hpp
index bbec03dfe1..a5c0de5e76 100644
--- a/include/RAJA/util/sort.hpp
+++ b/include/RAJA/util/sort.hpp
@@ -40,37 +40,41 @@ namespace detail
     and using O(N) predicate evaluations and O(1) memory
 */
 template <typename Iter, typename Predicate>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-partition(Iter begin,
-          Iter end,
-          Predicate pred)
+RAJA_HOST_DEVICE RAJA_INLINE Iter partition(Iter begin,
+                                            Iter end,
+                                            Predicate pred)
 {
   using ::RAJA::safe_iter_swap;
 
-  if (begin == end) {
+  if (begin == end)
+  {
     return begin;
   }
 
   // advance to first false
   Iter first_false = begin;
-  for (; first_false != end; ++first_false) {
+  for (; first_false != end; ++first_false)
+  {
 
-    if (!pred(first_false)) {
+    if (!pred(first_false))
+    {
       break;
     }
   }
 
   // return if none were false
-  if (first_false == end) {
+  if (first_false == end)
+  {
     return first_false;
   }
 
   // advance through rest of list to find the next true
-  for (Iter next_true = RAJA::next(first_false); next_true != end; ++next_true) {
+  for (Iter next_true = RAJA::next(first_false); next_true != end; ++next_true)
+  {
 
     // find the end of a range of falses [first_false, next_true)
-    if (pred(next_true)) {
+    if (pred(next_true))
+    {
 
       // shift the known range of falses forward
       // by swapping the true to the beginning of the range
@@ -87,33 +91,36 @@ partition(Iter begin,
     and using O(N^2) comparisons and O(1) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-insertion_sort(Iter begin,
-               Iter end,
-               Compare comp)
+RAJA_HOST_DEVICE RAJA_INLINE void
+insertion_sort(Iter begin, Iter end, Compare comp)
 {
   using ::RAJA::safe_iter_swap;
 
-  if (begin == end) {
+  if (begin == end)
+  {
     return;
   }
 
   // for each unsorted item in the right side of the range
-  for (Iter next_unsorted = RAJA::next(begin); next_unsorted != end; ++next_unsorted) {
+  for (Iter next_unsorted = RAJA::next(begin); next_unsorted != end;
+       ++next_unsorted)
+  {
 
     // insert unsorted item into the sorted left side of the range
-    for (Iter to_insert = next_unsorted; to_insert != begin; --to_insert) {
+    for (Iter to_insert = next_unsorted; to_insert != begin; --to_insert)
+    {
 
       Iter next_sorted = RAJA::prev(to_insert);
 
       // compare with next item to left
-      if (comp(*to_insert, *next_sorted)) {
+      if (comp(*to_insert, *next_sorted))
+      {
 
         // swap down if should be before
         safe_iter_swap(next_sorted, to_insert);
-
-      } else {
+      }
+      else
+      {
 
         // stop if in correct position
         break;
@@ -125,20 +132,16 @@ insertion_sort(Iter begin,
 /*!
     \brief get number of strides for shell sort
 */
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr size_t num_shell_strides()
-{
-  return 39;
-}
+RAJA_HOST_DEVICE RAJA_INLINE constexpr size_t num_shell_strides() { return 39; }
 
 /*!
     \brief get strides for shell sort
 */
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr long long unsigned get_shell_stride(int i)
+RAJA_HOST_DEVICE RAJA_INLINE constexpr long long unsigned
+get_shell_stride(int i)
 {
   using array_type = long long unsigned[num_shell_strides()];
-  return (array_type{
+  return (array_type {
       // strides from M. Ciura 2001
       1llu, 4llu, 10llu, 23llu, 57llu, 132llu, 301llu, 701llu, 1750llu,
       // extended up to 2^47 with strides[n] = floor(2.25*strides[n-1])
@@ -147,8 +150,8 @@ constexpr long long unsigned get_shell_stride(int i)
       149109795llu, 335497038llu, 754868335llu, 1698453753llu, 3821520944llu,
       8598422124llu, 19346449779llu, 43529512002llu, 97941402004llu,
       220368154509llu, 495828347645llu, 1115613782201llu, 2510131009952llu,
-      5647794772392llu, 12707538237882llu, 28591961035234llu, 64331912329276llu
-    })[i];
+      5647794772392llu, 12707538237882llu, 28591961035234llu,
+      64331912329276llu})[i];
 }
 
 /*!
@@ -156,26 +159,27 @@ constexpr long long unsigned get_shell_stride(int i)
     and using O(N^?) comparisons and O(1) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-shell_sort(Iter begin,
-           Iter end,
-           Compare comp)
+RAJA_HOST_DEVICE RAJA_INLINE void shell_sort(Iter begin, Iter end, Compare comp)
 {
   using ::RAJA::safe_iter_swap;
   using diff_type = ::RAJA::detail::IterDiff<Iter>;
 
   diff_type n = end - begin;
 
-  if (n <= static_cast<diff_type>(1)) {
+  if (n <= static_cast<diff_type>(1))
+  {
     return;
-  } else if (get_shell_stride(1) < static_cast<unsigned long long>(n)) {
+  }
+  else if (get_shell_stride(1) < static_cast<unsigned long long>(n))
+  {
 
     int i_stride = 2;
     // find first stride larger than n
     constexpr int num_strides = num_shell_strides();
-    for (; i_stride < num_strides; ++i_stride) {
-      if (get_shell_stride(i_stride) >= static_cast<unsigned long long>(n)) {
+    for (; i_stride < num_strides; ++i_stride)
+    {
+      if (get_shell_stride(i_stride) >= static_cast<unsigned long long>(n))
+      {
         break;
       }
     }
@@ -184,25 +188,32 @@ shell_sort(Iter begin,
 
     // for each stride size smaller than n, largest to smallest, not including 1
     // sort strided ranges with stride stride
-    for (; i_stride > 0; --i_stride) {
+    for (; i_stride > 0; --i_stride)
+    {
       diff_type stride = static_cast<diff_type>(get_shell_stride(i_stride));
 
       // for each unsorted item in the right side of each strided range
-      for (diff_type i_next_unsorted = stride; i_next_unsorted != n; ++i_next_unsorted) {
+      for (diff_type i_next_unsorted = stride; i_next_unsorted != n;
+           ++i_next_unsorted)
+      {
 
         // insert unsorted item into the sorted left side of the strided range
-        for (diff_type i_to_insert = i_next_unsorted; i_to_insert >= stride; i_to_insert -= stride) {
+        for (diff_type i_to_insert = i_next_unsorted; i_to_insert >= stride;
+             i_to_insert -= stride)
+        {
 
-          Iter to_insert = begin + i_to_insert;
+          Iter to_insert   = begin + i_to_insert;
           Iter next_sorted = to_insert - stride;
 
           // compare with next item to left
-          if (comp(*to_insert, *next_sorted)) {
+          if (comp(*to_insert, *next_sorted))
+          {
 
             // swap down if should be before
             safe_iter_swap(next_sorted, to_insert);
-
-          } else {
+          }
+          else
+          {
 
             // stop if in correct position
             break;
@@ -222,12 +233,8 @@ shell_sort(Iter begin,
     and using O(lg(N)) comparisons and O(1) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-heapify(Iter begin,
-        Iter root,
-        Iter end,
-        Compare comp)
+RAJA_HOST_DEVICE RAJA_INLINE void
+heapify(Iter begin, Iter root, Iter end, Compare comp)
 {
   using RAJA::safe_iter_swap;
 
@@ -235,24 +242,28 @@ heapify(Iter begin,
 
   // heapify the root node into place
   // until this is a max heap again
-  for (auto i = root - begin; 2*i+1 < N; i = root - begin) {
+  for (auto i = root - begin; 2 * i + 1 < N; i = root - begin)
+  {
 
     // find the max item amongst the root, left child, and right child
     Iter maxit = root;
 
     // left child
-    Iter child = begin + 2*i+1;
-    if (comp(*maxit, *child)) {
+    Iter child = begin + 2 * i + 1;
+    if (comp(*maxit, *child))
+    {
       maxit = child;
     }
 
     // right child
     ++child;
-    if (child != end && comp(*maxit, *child)) {
+    if (child != end && comp(*maxit, *child))
+    {
       maxit = child;
     }
 
-    if (maxit == root) {
+    if (maxit == root)
+    {
       // root is the max, done
       break;
     }
@@ -269,24 +280,22 @@ heapify(Iter begin,
     and using O(N*lg(N)) comparisons and O(1) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-heap_sort(Iter begin,
-          Iter end,
-          Compare comp)
+RAJA_HOST_DEVICE inline void heap_sort(Iter begin, Iter end, Compare comp)
 {
   using RAJA::safe_iter_swap;
 
   auto N = end - begin;
 
-  if (N < 2) {
+  if (N < 2)
+  {
     // already sorted
     return;
   }
 
   // make range into a max heap by
   // going through nodes with children one-by-one in reverse order
-  for (Iter root = begin + (N-1)/2; root != begin; --root) {
+  for (Iter root = begin + (N - 1) / 2; root != begin; --root)
+  {
     // heapify a sub-heap
     heapify(begin, root, end, comp);
   }
@@ -294,7 +303,8 @@ heap_sort(Iter begin,
   heapify(begin, begin, end, comp);
 
   // remove one element from max heap repeatedly until sorted
-  for (--end; begin != end; --end) {
+  for (--end; begin != end; --end)
+  {
 
     // swap max element into sorted position at end of heap
     safe_iter_swap(begin, end);
@@ -325,12 +335,8 @@ struct intro_sort_insertion_sort_cutoff
     and using O(N*lg(N)) comparisons and O(lg(N)) memory, with limited depth.
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-intro_sort_depth(Iter begin,
-                 Iter end,
-                 Compare comp,
-                 unsigned depth)
+RAJA_HOST_DEVICE inline void
+intro_sort_depth(Iter begin, Iter end, Compare comp, unsigned depth)
 {
   using RAJA::safe_iter_swap;
   using diff_type = ::RAJA::detail::IterDiff<Iter>;
@@ -341,57 +347,56 @@ intro_sort_depth(Iter begin,
   constexpr diff_type insertion_sort_cutoff =
       static_cast<diff_type>(intro_sort_insertion_sort_cutoff::get());
 
-  if (N < 2) {
+  if (N < 2)
+  {
 
     // already sorted
-
-  } else if (N < insertion_sort_cutoff) {
+  }
+  else if (N < insertion_sort_cutoff)
+  {
 
     // use insertion sort for small inputs
     detail::insertion_sort(begin, end, comp);
-
-  } else if (depth == 0) {
+  }
+  else if (depth == 0)
+  {
 
     // use heap sort if recurse too deep
     detail::heap_sort(begin, end, comp);
-
-  } else {
+  }
+  else
+  {
 
     // use quick sort
     // choose pivot with median of 3 (N >= insertion_sort_cutoff)
-    Iter mid = begin + N/2;
-    Iter last = end-1;
-    Iter pivot = comp(*begin, *mid)
-                    ? ( comp(*mid, *last)
-                           ? mid
-                           : ( comp(*begin, *last)
-                                  ? last
-                                  : begin ) )
-                    : ( comp(*mid, *last)
-                           ? ( comp(*begin, *last)
-                                  ? begin
-                                  : last )
-                           : mid );
+    Iter mid  = begin + N / 2;
+    Iter last = end - 1;
+    Iter pivot =
+        comp(*begin, *mid)
+            ? (comp(*mid, *last) ? mid : (comp(*begin, *last) ? last : begin))
+            : (comp(*mid, *last) ? (comp(*begin, *last) ? begin : last) : mid);
 
     // swap pivot to last
-    if (pivot != last) {
+    if (pivot != last)
+    {
       safe_iter_swap(pivot, last);
       pivot = last;
     }
 
     // partition
-    mid = partition(begin, last, [&](Iter it){ return comp(*it, *pivot); });
+    mid = partition(begin, last, [&](Iter it) { return comp(*it, *pivot); });
 
     // swap pivot to sorted position
-    if (mid != pivot) {
+    if (mid != pivot)
+    {
       safe_iter_swap(mid, pivot);
       pivot = mid;
     }
 
     // recurse to sort first and second parts, ignoring already sorted pivot
     // by construction pivot is always in the range [begin, last]
-    detail::intro_sort_depth(begin, pivot, comp, depth-1);
-    detail::intro_sort_depth(RAJA::next(pivot), end, comp, depth-1);
+    detail::intro_sort_depth(begin, pivot, comp, depth - 1);
+    detail::intro_sort_depth(RAJA::next(pivot), end, comp, depth - 1);
   }
 }
 
@@ -400,20 +405,18 @@ intro_sort_depth(Iter begin,
     and using O(N*lg(N)) comparisons and O(lg(N)) memory
 */
 template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-intro_sort(Iter begin,
-           Iter end,
-           Compare comp)
+RAJA_HOST_DEVICE inline void intro_sort(Iter begin, Iter end, Compare comp)
 {
   auto N = end - begin;
 
   // set max depth to 2*lg(N)
-  unsigned max_depth = 2*RAJA::log2(N);
+  unsigned max_depth = 2 * RAJA::log2(N);
 
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-  // limit max_depth statically in device code to allow compiler to remove recursion
-  if (max_depth > detail::intro_sort_device_max_depth::get()) {
+  // limit max_depth statically in device code to allow compiler to remove
+  // recursion
+  if (max_depth > detail::intro_sort_device_max_depth::get())
+  {
     max_depth = detail::intro_sort_device_max_depth::get();
   }
 #endif
@@ -426,25 +429,20 @@ intro_sort(Iter begin,
     with local range/2 copy
 */
 template <typename Iter, typename Compare>
-void
-RAJA_INLINE
-inplace_merge(  Iter first,
-                Iter middle,
-                Iter last,
-                Compare comp  )
+void RAJA_INLINE inplace_merge(Iter first, Iter middle, Iter last, Compare comp)
 {
-  using diff_type = RAJA::detail::IterDiff<Iter>;
+  using diff_type  = RAJA::detail::IterDiff<Iter>;
   using value_type = RAJA::detail::IterVal<Iter>;
 
   diff_type copylen = middle - first;
 
-  if ( first == middle || middle == last )
+  if (first == middle || middle == last)
   {
     // at least one side empty, already sorted
     return;
   }
 
-  if ( !comp(*middle, *(middle-1)) )
+  if (!comp(*middle, *(middle - 1)))
   {
     // everything already in order, done
     return;
@@ -455,37 +453,39 @@ inplace_merge(  Iter first,
   buf_deleter_type buf_deleter;
 
   std::unique_ptr<value_type, buf_deleter_type&> copy_buf(
-      RAJA::allocate_aligned_type<value_type>( RAJA::DATA_ALIGN, copylen * sizeof(value_type) ),
+      RAJA::allocate_aligned_type<value_type>(RAJA::DATA_ALIGN,
+                                              copylen * sizeof(value_type)),
       buf_deleter);
 
   value_type* copyarr = copy_buf.get();
 
   // check memory allocation worked
-  if (copyarr == nullptr) {
-    RAJA_ABORT_OR_THROW( "inplace_merge temporary memory allocation failed" );
+  if (copyarr == nullptr)
+  {
+    RAJA_ABORT_OR_THROW("inplace_merge temporary memory allocation failed");
   }
 
   // move construct input into buffer storage
   // use buf_deleter.size as index to keep track of objects constructed
-  for ( diff_type& cc = buf_deleter.size; cc < copylen; ++cc )
+  for (diff_type& cc = buf_deleter.size; cc < copylen; ++cc)
   {
-    new(&copyarr[cc]) value_type(std::move(first[cc]));
+    new (&copyarr[cc]) value_type(std::move(first[cc]));
   }
 
   // merge
-  for ( diff_type cur = 0; cur < copylen; )
+  for (diff_type cur = 0; cur < copylen;)
   {
-    if ( middle >= last ) // moved all second half, put copy into remainder
+    if (middle >= last)  // moved all second half, put copy into remainder
     {
-      std::move( copyarr+cur, copyarr+copylen, first );
+      std::move(copyarr + cur, copyarr + copylen, first);
       break;
     }
-    else if ( first == middle ) // everything prior to middle is sorted, done
+    else if (first == middle)  // everything prior to middle is sorted, done
     {
       break;
     }
 
-    if ( comp(*middle, copyarr[cur]) )
+    if (comp(*middle, copyarr[cur]))
     {
       *first = std::move(*middle);
       ++middle;
@@ -505,47 +505,46 @@ inplace_merge(  Iter first,
     while copies are outside, somewhat follows STL API
 */
 template <typename Iter1, typename Iter2, typename OutIter, typename Compare>
-//constexpr OutIter // <-- std:: return value
-void
-RAJA_INLINE
-merge_like_std( Iter1 first1,
-                Iter1 last1,
-                Iter2 first2,
-                Iter2 last2,
-                OutIter d_first,  // using this as direct access to result
-                Compare comp)
+// constexpr OutIter // <-- std:: return value
+void RAJA_INLINE
+merge_like_std(Iter1 first1,
+               Iter1 last1,
+               Iter2 first2,
+               Iter2 last2,
+               OutIter d_first,  // using this as direct access to result
+               Compare comp)
 {
   using ::RAJA::safe_iter_swap;
 
-  if ( first1 == last2 - 1 )  // should never need to do this
+  if (first1 == last2 - 1)  // should never need to do this
   {
     return;
   }
 
-  if ( (last2 - first1) == 2 ) // only 2 elements, simple swap
+  if ((last2 - first1) == 2)  // only 2 elements, simple swap
   {
-    if ( !comp(*d_first, *(d_first+1)) )
+    if (!comp(*d_first, *(d_first + 1)))
     {
-      safe_iter_swap( d_first, d_first+1 );
+      safe_iter_swap(d_first, d_first + 1);
     }
     return;
   }
 
-  while ( first1 < last1 || first2 < last2 )
+  while (first1 < last1 || first2 < last2)
   {
-    if ( first1 >= last1 ) // first half done
+    if (first1 >= last1)  // first half done
     {
       *d_first = std::move(*first2);
       ++first2;
     }
-    else if ( first2 >= last2 )  // second half done
+    else if (first2 >= last2)  // second half done
     {
       *d_first = std::move(*first1);
       ++first1;
     }
     else  // neither half done
     {
-      if ( comp( *first2, *first1 ) )
+      if (comp(*first2, *first1))
       {
         *d_first = std::move(*first2);
         ++first2;
@@ -568,34 +567,30 @@ merge_like_std( Iter1 first1,
     and using O(N*lg(N)) comparisons and O(N) memory
 */
 template <typename Iter, typename Compare>
-RAJA_INLINE
-void
-merge_sort(Iter begin,
-           Iter end,
-           Compare comp)
+RAJA_INLINE void merge_sort(Iter begin, Iter end, Compare comp)
 {
-  using diff_type = RAJA::detail::IterDiff<Iter>;
+  using diff_type  = RAJA::detail::IterDiff<Iter>;
   using value_type = RAJA::detail::IterVal<Iter>;
 
   // iterative mergesort (bottom up) for future parallelism
 
   // min helper
-  auto minlam = [] (diff_type a, diff_type b) {return (a < b) ? a : b;};
+  auto minlam = [](diff_type a, diff_type b) { return (a < b) ? a : b; };
 
   // insertion sort for sizes <= 16
-  diff_type len = end - begin;
+  diff_type len                                    = end - begin;
   static constexpr diff_type insertion_sort_cutoff = 16;
-  if ( len <= insertion_sort_cutoff && len > 0 )
+  if (len <= insertion_sort_cutoff && len > 0)
   {
-    detail::insertion_sort( begin, end, comp );
+    detail::insertion_sort(begin, end, comp);
   }
   else
   {
     // insertion sort on 16-element chunks, then merge
-    for ( diff_type start = 0; start < len; start += insertion_sort_cutoff )
+    for (diff_type start = 0; start < len; start += insertion_sort_cutoff)
     {
-      diff_type lastchunk = minlam( insertion_sort_cutoff, len - start );
-      detail::insertion_sort( begin + start, begin + start + lastchunk, comp );
+      diff_type lastchunk = minlam(insertion_sort_cutoff, len - start);
+      detail::insertion_sort(begin + start, begin + start + lastchunk, comp);
     }
 
     // merge using extra storage
@@ -605,74 +600,86 @@ merge_sort(Iter begin,
     buf_deleter_type buf_deleter;
 
     std::unique_ptr<value_type, buf_deleter_type&> copy_buf(
-        RAJA::allocate_aligned_type<value_type>( RAJA::DATA_ALIGN, len * sizeof(value_type) ),
+        RAJA::allocate_aligned_type<value_type>(RAJA::DATA_ALIGN,
+                                                len * sizeof(value_type)),
         buf_deleter);
 
     value_type* copyarr = copy_buf.get();
 
     // check memory allocation worked
-    if (copyarr == nullptr) {
-      RAJA_ABORT_OR_THROW( "merge_sort temporary memory allocation failed" );
+    if (copyarr == nullptr)
+    {
+      RAJA_ABORT_OR_THROW("merge_sort temporary memory allocation failed");
     }
 
     // move construct input into buffer storage
     // use buf_deleter.size as index to keep track of objects constructed
-    for ( diff_type& cc = buf_deleter.size; cc < len; ++cc )
+    for (diff_type& cc = buf_deleter.size; cc < len; ++cc)
     {
-      new(&copyarr[cc]) value_type(std::move(begin[cc]));
+      new (&copyarr[cc]) value_type(std::move(begin[cc]));
     }
 
     bool copyvalid = true;
-    //for ( diff_type midpoint = 1; midpoint < len; midpoint *= 2 )  // O(log n) loop
-    for ( diff_type midpoint = 16; midpoint < len; midpoint *= 2 )  // O(log n) loop
+    // for ( diff_type midpoint = 1; midpoint < len; midpoint *= 2 )  // O(log
+    // n) loop
+    for (diff_type midpoint = 16; midpoint < len;
+         midpoint *= 2)  // O(log n) loop
     {
-      for ( diff_type start = 0; start < len; start += midpoint * 2 )  // O(n) merging loop (can be parallelized)
+      for (diff_type start = 0; start < len;
+           start += midpoint * 2)  // O(n) merging loop (can be parallelized)
       {
-        diff_type finish = minlam( start + midpoint * 2, len );
-        if ( finish > len )
+        diff_type finish = minlam(start + midpoint * 2, len);
+        if (finish > len)
         {
-          RAJA_ABORT_OR_THROW( "merge_sort invalid finish point" );  // sanity check
+          RAJA_ABORT_OR_THROW(
+              "merge_sort invalid finish point");  // sanity check
         }
 
-        if ( start + midpoint >= len )
+        if (start + midpoint >= len)
         {
           // copy sorted remainder over
-          if ( copyvalid )
+          if (copyvalid)
           {
-            std::move( copyarr + start, copyarr + finish, begin + start );
+            std::move(copyarr + start, copyarr + finish, begin + start);
           }
           else
           {
-            std::move( begin + start, begin + finish, copyarr + start );
+            std::move(begin + start, begin + finish, copyarr + start);
           }
           break;  // skip merge if no second half exists
         }
 
-        if ( copyvalid )  // switch arrays per level of merging to avoid copying back to copyarr
+        if (copyvalid)  // switch arrays per level of merging to avoid copying
+                        // back to copyarr
         {
-          detail::merge_like_std( copyarr + start, copyarr + start + midpoint, copyarr + start + midpoint, copyarr + finish, begin + start, comp );
+          detail::merge_like_std(copyarr + start, copyarr + start + midpoint,
+                                 copyarr + start + midpoint, copyarr + finish,
+                                 begin + start, comp);
         }
         else
         {
-          detail::merge_like_std( begin + start, begin + start + midpoint, begin + start + midpoint, begin + finish, copyarr + start, comp );
+          detail::merge_like_std(begin + start, begin + start + midpoint,
+                                 begin + start + midpoint, begin + finish,
+                                 copyarr + start, comp);
         }
       }
 
-      copyvalid = !copyvalid; // switch arrays per level of merging to avoid copying back to copyarr
+      copyvalid = !copyvalid;  // switch arrays per level of merging to avoid
+                               // copying back to copyarr
     }
 
     // update copy if necessary
-    if ( copyvalid )
+    if (copyvalid)
     {
-      std::move( copyarr, copyarr + len, begin );
+      std::move(copyarr, copyarr + len, begin);
     }
   }
-  //else
+  // else
   //{
-      // Possible TBD: in-place mergesort
-      // Would shift (like insertion sort) when performing merge.
-      // PRO - Can use on GPU, O(1) storage required.
-      // CON - Shifting would cause slowdown O(n^2 log n).
+  //  Possible TBD: in-place mergesort
+  //  Would shift (like insertion sort) when performing merge.
+  //  PRO - Can use on GPU, O(1) storage required.
+  //  CON - Shifting would cause slowdown O(n^2 log n).
   //}
 }
 
@@ -684,10 +691,9 @@ merge_sort(Iter begin,
 */
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
-RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-insertion_sort(Container&& c,
-               Compare comp = Compare{})
+RAJA_HOST_DEVICE
+    RAJA_INLINE concepts::enable_if<type_traits::is_range<Container>>
+    insertion_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -700,9 +706,11 @@ insertion_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::insertion_sort(begin_it, end_it, comp);
     }
   }
@@ -714,10 +722,9 @@ insertion_sort(Container&& c,
 */
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
-RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-shell_sort(Container&& c,
-           Compare comp = Compare{})
+RAJA_HOST_DEVICE
+    RAJA_INLINE concepts::enable_if<type_traits::is_range<Container>>
+    shell_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -730,9 +737,11 @@ shell_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::shell_sort(begin_it, end_it, comp);
     }
   }
@@ -744,10 +753,9 @@ shell_sort(Container&& c,
 */
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
-RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-heap_sort(Container&& c,
-          Compare comp = Compare{})
+RAJA_HOST_DEVICE
+    RAJA_INLINE concepts::enable_if<type_traits::is_range<Container>>
+    heap_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -760,9 +768,11 @@ heap_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::heap_sort(begin_it, end_it, comp);
     }
   }
@@ -774,10 +784,9 @@ heap_sort(Container&& c,
 */
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
-RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-intro_sort(Container&& c,
-           Compare comp = Compare{})
+RAJA_HOST_DEVICE
+    RAJA_INLINE concepts::enable_if<type_traits::is_range<Container>>
+    intro_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -790,9 +799,11 @@ intro_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::intro_sort(begin_it, end_it, comp);
     }
   }
@@ -804,10 +815,8 @@ intro_sort(Container&& c,
 */
 template <typename Container,
           typename Compare = operators::less<detail::ContainerVal<Container>>>
-RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-merge_sort(Container&& c,
-           Compare comp = Compare{})
+RAJA_INLINE concepts::enable_if<type_traits::is_range<Container>>
+merge_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -820,9 +829,11 @@ merge_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::merge_sort(begin_it, end_it, comp);
     }
   }
diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp
index 310217bde5..b38e82e45b 100644
--- a/include/RAJA/util/types.hpp
+++ b/include/RAJA/util/types.hpp
@@ -41,7 +41,7 @@ namespace RAJA
 ///
 enum named_usage : int
 {
-  ignored = -1,
+  ignored     = -1,
   unspecified = 0
 };
 
@@ -70,13 +70,19 @@ enum struct kernel_sync_requirement : int
 namespace iteration_mapping
 {
 
-struct DirectBase {};
-struct LoopBase {};
-struct ContiguousLoopBase : LoopBase {};
-struct StridedLoopBase : LoopBase {};
-struct UnsizedLoopBase {};
-struct SizedLoopBase {};
-template < size_t t_max_iterations >
+struct DirectBase
+{};
+struct LoopBase
+{};
+struct ContiguousLoopBase : LoopBase
+{};
+struct StridedLoopBase : LoopBase
+{};
+struct UnsizedLoopBase
+{};
+struct SizedLoopBase
+{};
+template <size_t t_max_iterations>
 struct SizedLoopSpecifyingBase : SizedLoopBase
 {
   static constexpr size_t max_iterations = t_max_iterations;
@@ -103,7 +109,8 @@ struct SizedLoopSpecifyingBase : SizedLoopBase
 ///   // 3 -> {3}
 ///   // 4 -> {}
 ///
-struct Direct : DirectBase {};
+struct Direct : DirectBase
+{};
 
 ///
 /// Contiguousloop assumes the loop has fewer iterations than indices and
@@ -130,10 +137,13 @@ struct Direct : DirectBase {};
 ///   // 1 -> {3, 4, 5}
 ///   // 2 -> {6, 7}
 ///
-template < size_t max_iterations >
-struct Contiguousloop : ContiguousLoopBase,
-    std::conditional_t<(max_iterations != named_usage::unspecified),
-                       SizedLoopSpecifyingBase<max_iterations>, UnsizedLoopBase> {};
+template <size_t max_iterations>
+struct Contiguousloop
+    : ContiguousLoopBase,
+      std::conditional_t<(max_iterations != named_usage::unspecified),
+                         SizedLoopSpecifyingBase<max_iterations>,
+                         UnsizedLoopBase>
+{};
 
 ///
 /// StridedLoop assumes the loop has fewer iterations than indices and
@@ -160,18 +170,25 @@ struct Contiguousloop : ContiguousLoopBase,
 ///   // 1 -> {1, 4, 7}
 ///   // 2 -> {2, 5}
 ///
-template < size_t max_iterations >
-struct StridedLoop : StridedLoopBase,
-    std::conditional_t<(max_iterations != named_usage::unspecified),
-                       SizedLoopSpecifyingBase<max_iterations>, UnsizedLoopBase> {};
+template <size_t max_iterations>
+struct StridedLoop
+    : StridedLoopBase,
+      std::conditional_t<(max_iterations != named_usage::unspecified),
+                         SizedLoopSpecifyingBase<max_iterations>,
+                         UnsizedLoopBase>
+{};
 
-} // namespace iteration_mapping
+}  // namespace iteration_mapping
 
 ///
 /// Enumeration used to indicate whether ListSegment object owns data
 /// representing its indices.
 ///
-enum IndexOwnership { Unowned, Owned };
+enum IndexOwnership
+{
+  Unowned,
+  Owned
+};
 
 ///
 /// Type use for all loop indexing in RAJA constructs.
@@ -189,8 +206,8 @@ const int UndefinedValue = -9999999;
 /// Template list of sizes
 ///
 template <Index_type... Sizes>
-struct SizeList {
-};
+struct SizeList
+{};
 
 
 ///
@@ -203,15 +220,15 @@ struct Fraction
 
   using inverse = Fraction<int_t, denominator, numerator>;
 
-  template < typename new_int_t >
-  using rebind = Fraction<new_int_t, new_int_t(numerator), new_int_t(denominator)>;
+  template <typename new_int_t>
+  using rebind =
+      Fraction<new_int_t, new_int_t(numerator), new_int_t(denominator)>;
 
   static constexpr int_t multiply(int_t val) noexcept
   {
     return (val / denominator) * numerator +
            (val % denominator) * numerator / denominator;
   }
-
 };
 
 
@@ -254,7 +271,8 @@ using Complex_type = std::complex<Real_type>;
 // alignment attribute supported for versions > 12
 //
 #if __ICC >= 1300
-using TDRAReal_ptr = Real_type* RAJA_RESTRICT __attribute__((align_value(RAJA::DATA_ALIGN)));
+using TDRAReal_ptr =
+    Real_type* RAJA_RESTRICT __attribute__((align_value(RAJA::DATA_ALIGN)));
 
 using const_TDRAReal_ptr = const TDRAReal_ptr;
 #endif
@@ -262,7 +280,8 @@ using const_TDRAReal_ptr = const TDRAReal_ptr;
 #elif defined(RAJA_COMPILER_GNU)
 
 #elif defined(RAJA_COMPILER_CLANG)
-using TDRAReal_ptr = Real_type* RAJA_RESTRICT __attribute__((aligned(RAJA::DATA_ALIGN)));
+using TDRAReal_ptr =
+    Real_type* RAJA_RESTRICT __attribute__((aligned(RAJA::DATA_ALIGN)));
 
 using const_TDRAReal_ptr = const TDRAReal_ptr;
 
@@ -814,51 +833,51 @@ class RestrictComplexPtr
  ******************************************************************************
  */
 #if defined(RAJA_USE_BARE_PTR)
-using Real_ptr = Real_type*;
+using Real_ptr       = Real_type*;
 using const_Real_ptr = const Real_type*;
 
 #if defined(RAJA_USE_COMPLEX)
-using Complex_ptr = Complex_type*;
+using Complex_ptr       = Complex_type*;
 using const_Complex_ptr = const Complex_type*;
 #endif
 
-using UnalignedReal_ptr = Real_type*;
+using UnalignedReal_ptr       = Real_type*;
 using const_UnalignedReal_ptr = const Real_type*;
 
 #elif defined(RAJA_USE_RESTRICT_PTR)
-using Real_ptr = Real_type* RAJA_RESTRICT;
+using Real_ptr       = Real_type* RAJA_RESTRICT;
 using const_Real_ptr = const Real_type* RAJA_RESTRICT;
 
 #if defined(RAJA_USE_COMPLEX)
-using Complex_ptr = Complex_type* RAJA_RESTRICT;
+using Complex_ptr       = Complex_type* RAJA_RESTRICT;
 using const_Complex_ptr = const Complex_type* RAJA_RESTRICT;
 #endif
 
-using UnalignedReal_ptr = Real_type* RAJA_RESTRICT;
+using UnalignedReal_ptr       = Real_type* RAJA_RESTRICT;
 using const_UnalignedReal_ptr = const Real_type* RAJA_RESTRICT;
 
 #elif defined(RAJA_USE_RESTRICT_ALIGNED_PTR)
-using Real_ptr = TDRAReal_ptr;
+using Real_ptr       = TDRAReal_ptr;
 using const_Real_ptr = const_TDRAReal_ptr;
 
 #if defined(RAJA_USE_COMPLEX)
-using Complex_ptr = Complex_type* RAJA_RESTRICT;
+using Complex_ptr       = Complex_type* RAJA_RESTRICT;
 using const_Complex_ptr = const Complex_type* RAJA_RESTRICT;
 #endif
 
-using UnalignedReal_ptr = Real_type* RAJA_RESTRICT;
+using UnalignedReal_ptr       = Real_type* RAJA_RESTRICT;
 using const_UnalignedReal_ptr = const Real_type* RAJA_RESTRICT;
 
 #elif defined(RAJA_USE_PTR_CLASS)
-using Real_ptr = RestrictAlignedRealPtr;
+using Real_ptr       = RestrictAlignedRealPtr;
 using const_Real_ptr = ConstRestrictAlignedRealPtr;
 
 #if defined(RAJA_USE_COMPLEX)
-using Complex_ptr = RestrictComplexPtr;
+using Complex_ptr       = RestrictComplexPtr;
 using const_Complex_ptr = ConstRestrictComplexPtr;
 #endif
 
-using UnalignedReal_ptr = RestrictRealPtr;
+using UnalignedReal_ptr       = RestrictRealPtr;
 using const_UnalignedReal_ptr = ConstRestrictRealPtr;
 
 #else
@@ -867,20 +886,21 @@ using const_UnalignedReal_ptr = ConstRestrictRealPtr;
 #endif
 
 
-namespace detail {
+namespace detail
+{
 
 /*!
  * \brief Abstracts access to memory using normal memory accesses.
  */
 struct DefaultAccessor
 {
-  template < typename T >
+  template <typename T>
   static RAJA_HOST_DEVICE RAJA_INLINE T get(T* ptr, size_t i)
   {
     return ptr[i];
   }
 
-  template < typename T >
+  template <typename T>
   static RAJA_HOST_DEVICE RAJA_INLINE void set(T* ptr, size_t i, T val)
   {
     ptr[i] = val;
@@ -919,11 +939,11 @@ struct AsIntegerArray
                     sizeof(unsigned short) <= max_integer_type_size) ||
                    sizeof(unsigned char) < min_integer_type_size),
                   unsigned short,
-                  std::conditional_t<
-                      ((alignof(T) >= alignof(unsigned char) &&
-                        sizeof(unsigned char) <= max_integer_type_size)),
-                      unsigned char,
-                      void>>>>>;
+                  std::conditional_t<((alignof(T) >= alignof(unsigned char) &&
+                                       sizeof(unsigned char) <=
+                                           max_integer_type_size)),
+                                     unsigned char,
+                                     void>>>>>;
   static_assert(!std::is_same<integer_type, void>::value,
                 "could not find a compatible integer type");
   static_assert(sizeof(integer_type) >= min_integer_type_size,
@@ -965,28 +985,23 @@ template <typename T>
 struct ScopedAssignment
 {
   ScopedAssignment(T& val, T const& new_val)
-    : m_ref_to_val(val)
-    , m_prev_val(std::move(val))
+      : m_ref_to_val(val), m_prev_val(std::move(val))
   {
     m_ref_to_val = new_val;
   }
 
   ScopedAssignment(T& val, T&& new_val)
-    : m_ref_to_val(val)
-    , m_prev_val(std::move(val))
+      : m_ref_to_val(val), m_prev_val(std::move(val))
   {
     m_ref_to_val = std::move(new_val);
   }
 
-  ScopedAssignment(ScopedAssignment const&) = delete;
-  ScopedAssignment(ScopedAssignment &&) = delete;
+  ScopedAssignment(ScopedAssignment const&)            = delete;
+  ScopedAssignment(ScopedAssignment&&)                 = delete;
   ScopedAssignment& operator=(ScopedAssignment const&) = delete;
-  ScopedAssignment& operator=(ScopedAssignment &&) = delete;
+  ScopedAssignment& operator=(ScopedAssignment&&)      = delete;
 
-  ~ScopedAssignment()
-  {
-    m_ref_to_val = std::move(m_prev_val);
-  }
+  ~ScopedAssignment() { m_ref_to_val = std::move(m_prev_val); }
 
 private:
   T& m_ref_to_val;
diff --git a/include/RAJA/util/zip.hpp b/include/RAJA/util/zip.hpp
index 1beefeb9cc..b639a45226 100644
--- a/include/RAJA/util/zip.hpp
+++ b/include/RAJA/util/zip.hpp
@@ -37,41 +37,39 @@ namespace RAJA
     \brief ZipIterator class for simultaneously iterating over
     multiple iterators. This is not a standards compliant iterator.
 */
-template < typename ... Iters >
+template <typename... Iters>
 struct ZipIterator
 {
-  static_assert(concepts::all_of<type_traits::is_random_access_iterator<Iters>...>::value,
+  static_assert(
+      concepts::all_of<type_traits::is_random_access_iterator<Iters>...>::value,
       "ZipIterator can only contain random access iterators");
   static_assert(sizeof...(Iters) > 1,
-      "ZipIterator must contain one or more iterators");
+                "ZipIterator must contain one or more iterators");
 
-  using value_type = zip_val<typename std::iterator_traits<Iters>::value_type...>;
+  using value_type =
+      zip_val<typename std::iterator_traits<Iters>::value_type...>;
   using difference_type = std::ptrdiff_t;
-  using pointer = void;
+  using pointer         = void;
   using reference = zip_ref<typename std::iterator_traits<Iters>::reference...>;
-  using creference = zip_ref<const typename std::iterator_traits<Iters>::reference...>;
+  using creference =
+      zip_ref<const typename std::iterator_traits<Iters>::reference...>;
   using iterator_category = std::random_access_iterator_tag;
 
-  RAJA_HOST_DEVICE inline ZipIterator()
-    : m_iterators()
-  {
-  }
+  RAJA_HOST_DEVICE inline ZipIterator() : m_iterators() {}
 
-  template < typename... Args,
-             typename = concepts::enable_if<type_traits::convertible_to<Args&&, Iters>...> >
+  template <typename... Args,
+            typename = concepts::enable_if<
+                type_traits::convertible_to<Args&&, Iters>...>>
   RAJA_HOST_DEVICE inline ZipIterator(Args&&... args)
-    : m_iterators(std::forward<Args>(args)...)
-  {
-  }
+      : m_iterators(std::forward<Args>(args)...)
+  {}
 
   RAJA_HOST_DEVICE inline ZipIterator(const ZipIterator& rhs)
-    : m_iterators(rhs.m_iterators)
-  {
-  }
+      : m_iterators(rhs.m_iterators)
+  {}
   RAJA_HOST_DEVICE inline ZipIterator(ZipIterator&& rhs)
-    : m_iterators(std::move(rhs.m_iterators))
-  {
-  }
+      : m_iterators(std::move(rhs.m_iterators))
+  {}
 
   RAJA_HOST_DEVICE inline ZipIterator& operator=(const ZipIterator& rhs)
   {
@@ -97,11 +95,11 @@ struct ZipIterator
   }
   RAJA_HOST_DEVICE inline bool operator>(const ZipIterator& rhs) const
   {
-    return RAJA::get<0>(m_iterators) >  RAJA::get<0>(rhs.m_iterators);
+    return RAJA::get<0>(m_iterators) > RAJA::get<0>(rhs.m_iterators);
   }
   RAJA_HOST_DEVICE inline bool operator<(const ZipIterator& rhs) const
   {
-    return RAJA::get<0>(m_iterators) <  RAJA::get<0>(rhs.m_iterators);
+    return RAJA::get<0>(m_iterators) < RAJA::get<0>(rhs.m_iterators);
   }
   RAJA_HOST_DEVICE inline bool operator>=(const ZipIterator& rhs) const
   {
@@ -114,12 +112,12 @@ struct ZipIterator
 
   RAJA_HOST_DEVICE inline ZipIterator& operator++()
   {
-    detail::zip_for_each(m_iterators, detail::PreInc{});
+    detail::zip_for_each(m_iterators, detail::PreInc {});
     return *this;
   }
   RAJA_HOST_DEVICE inline ZipIterator& operator--()
   {
-    detail::zip_for_each(m_iterators, detail::PreDec{});
+    detail::zip_for_each(m_iterators, detail::PreDec {});
     return *this;
   }
   RAJA_HOST_DEVICE inline ZipIterator operator++(int)
@@ -135,41 +133,38 @@ struct ZipIterator
     return tmp;
   }
 
-  RAJA_HOST_DEVICE inline ZipIterator& operator+=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline ZipIterator& operator+=(const difference_type& rhs)
   {
-    detail::zip_for_each(m_iterators, detail::PlusEq<difference_type>{rhs});
+    detail::zip_for_each(m_iterators, detail::PlusEq<difference_type> {rhs});
     return *this;
   }
-  RAJA_HOST_DEVICE inline ZipIterator& operator-=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline ZipIterator& operator-=(const difference_type& rhs)
   {
-    detail::zip_for_each(m_iterators, detail::MinusEq<difference_type>{rhs});
+    detail::zip_for_each(m_iterators, detail::MinusEq<difference_type> {rhs});
     return *this;
   }
 
-  RAJA_HOST_DEVICE inline difference_type operator-(
-      const ZipIterator& rhs) const
+  RAJA_HOST_DEVICE inline difference_type
+  operator-(const ZipIterator& rhs) const
   {
     return RAJA::get<0>(m_iterators) - RAJA::get<0>(rhs.m_iterators);
   }
-  RAJA_HOST_DEVICE inline ZipIterator operator+(
-      const difference_type& rhs) const
+  RAJA_HOST_DEVICE inline ZipIterator
+  operator+(const difference_type& rhs) const
   {
     ZipIterator tmp(*this);
     tmp += rhs;
     return tmp;
   }
-  RAJA_HOST_DEVICE inline ZipIterator operator-(
-      const difference_type& rhs) const
+  RAJA_HOST_DEVICE inline ZipIterator
+  operator-(const difference_type& rhs) const
   {
     ZipIterator tmp(*this);
     tmp -= rhs;
     return tmp;
   }
-  RAJA_HOST_DEVICE friend ZipIterator operator+(
-      difference_type lhs,
-      const ZipIterator& rhs)
+  RAJA_HOST_DEVICE friend ZipIterator operator+(difference_type lhs,
+                                                const ZipIterator& rhs)
   {
     ZipIterator tmp(rhs);
     tmp += lhs;
@@ -178,7 +173,7 @@ struct ZipIterator
 
   RAJA_HOST_DEVICE inline reference operator*() const
   {
-    return deref_helper(camp::make_idx_seq_t<sizeof...(Iters)>{});
+    return deref_helper(camp::make_idx_seq_t<sizeof...(Iters)> {});
   }
   // TODO:: figure out what to do with this
   // RAJA_HOST_DEVICE inline reference operator->() const
@@ -190,15 +185,16 @@ struct ZipIterator
     return *((*this) + rhs);
   }
 
-  RAJA_HOST_DEVICE friend inline void safe_iter_swap(ZipIterator lhs, ZipIterator rhs)
+  RAJA_HOST_DEVICE friend inline void safe_iter_swap(ZipIterator lhs,
+                                                     ZipIterator rhs)
   {
-    detail::zip_for_each(lhs.m_iterators, rhs.m_iterators, detail::IterSwap{});
+    detail::zip_for_each(lhs.m_iterators, rhs.m_iterators, detail::IterSwap {});
   }
 
 private:
   zip_val<camp::decay<Iters>...> m_iterators;
 
-  template < camp::idx_t ... Is >
+  template <camp::idx_t... Is>
   RAJA_HOST_DEVICE inline reference deref_helper(camp::idx_seq<Is...>) const
   {
     return reference(*RAJA::get<Is>(m_iterators)...);
@@ -210,10 +206,8 @@ struct ZipIterator
     \brief Zip multiple iterators together to iterate them simultaneously with
     a single ZipIterator object.
 */
-template < typename... Args >
-RAJA_HOST_DEVICE
-auto zip(Args&&... args)
-  -> ZipIterator<camp::decay<Args>...>
+template <typename... Args>
+RAJA_HOST_DEVICE auto zip(Args&&... args) -> ZipIterator<camp::decay<Args>...>
 {
   return {std::forward<Args>(args)...};
 }
@@ -223,29 +217,28 @@ auto zip(Args&&... args)
     ZipIterator objects.
 */
 template <typename... Args>
-RAJA_HOST_DEVICE RAJA_INLINE
-auto zip_span(Args&&... args)
-  -> Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
-          typename ZipIterator<detail::ContainerIter<camp::decay<Args>>...>::difference_type>
+RAJA_HOST_DEVICE RAJA_INLINE auto zip_span(Args&&... args)
+    -> Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
+            typename ZipIterator<
+                detail::ContainerIter<camp::decay<Args>>...>::difference_type>
 {
   using std::begin;
   using std::end;
   return Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
-              typename ZipIterator<detail::ContainerIter<camp::decay<Args>>...>::difference_type>(
+              typename ZipIterator<detail::ContainerIter<
+                  camp::decay<Args>>...>::difference_type>(
       zip(begin(std::forward<Args>(args))...),
-      zip(  end(std::forward<Args>(args))...));
+      zip(end(std::forward<Args>(args))...));
 }
 
 /*!
     \brief Comparator object that compares the first member
     of tuple like objects.
 */
-template < typename T, typename Compare >
+template <typename T, typename Compare>
 struct CompareFirst
 {
-  RAJA_HOST_DEVICE inline CompareFirst(Compare comp_)
-    : comp(comp_)
-  { }
+  RAJA_HOST_DEVICE inline CompareFirst(Compare comp_) : comp(comp_) {}
 
   RAJA_HOST_DEVICE inline bool operator()(T const& lhs, T const& rhs)
   {
@@ -260,10 +253,8 @@ struct CompareFirst
     \brief Make a comparator to compare first member of tuple
     like objects of type T.
 */
-template < typename T, typename Compare >
-RAJA_HOST_DEVICE
-auto compare_first(Compare comp)
-  -> CompareFirst<T, Compare>
+template <typename T, typename Compare>
+RAJA_HOST_DEVICE auto compare_first(Compare comp) -> CompareFirst<T, Compare>
 {
   return {comp};
 }
diff --git a/include/RAJA/util/zip_tuple.hpp b/include/RAJA/util/zip_tuple.hpp
index d631d4714b..5faecebf7b 100644
--- a/include/RAJA/util/zip_tuple.hpp
+++ b/include/RAJA/util/zip_tuple.hpp
@@ -31,49 +31,61 @@
 namespace RAJA
 {
 
-template < bool is_val, typename ... Ts >
+template <bool is_val, typename... Ts>
 struct zip_tuple;
 
-template < camp::idx_t I, typename ZT >
+template <camp::idx_t I, typename ZT>
 struct zip_tuple_element;
 
-template < camp::idx_t I, bool is_val, typename ... Ts >
+template <camp::idx_t I, bool is_val, typename... Ts>
 struct zip_tuple_element<I, zip_tuple<is_val, Ts...>>
-  : camp::tuple_element<I, typename zip_tuple<is_val, Ts...>::value_type>
-{ };
+    : camp::tuple_element<I, typename zip_tuple<is_val, Ts...>::value_type>
+{};
 
-template < camp::idx_t I, typename ZT >
+template <camp::idx_t I, typename ZT>
 using zip_tuple_element_t = typename zip_tuple_element<I, ZT>::type;
 
 
 // get function declarations for zip_tuple
 // the reference type returned by get depends on the reference type
 // of the zip_tuple that get is called on
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr                         RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> &
-get(zip_tuple<is_val, Ts...>      &  z) noexcept
-{ return           z .template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr                         RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> const&
-get(zip_tuple<is_val, Ts...> const&  z) noexcept
-{ return           z .template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> &&
-get(zip_tuple<is_val, Ts...>      && z) noexcept
-{ return std::move(z).template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> const&&
+template <camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr RAJA::zip_tuple_element_t<I,
+                                                     zip_tuple<is_val, Ts...>>&
+get(zip_tuple<is_val, Ts...>& z) noexcept
+{
+  return z.template get<I>();
+}
+template <camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr RAJA::
+    zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> const&
+    get(zip_tuple<is_val, Ts...> const& z) noexcept
+{
+  return z.template get<I>();
+}
+template <camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+    RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>>&&
+get(zip_tuple<is_val, Ts...>&& z) noexcept
+{
+  return std::move(z).template get<I>();
+}
+template <camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+    RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> const&&
 get(zip_tuple<is_val, Ts...> const&& z) noexcept
-{ return std::move(z).template get<I>(); }
+{
+  return std::move(z).template get<I>();
+}
 
 namespace detail
 {
 
 struct PassThrough
 {
-  template < typename T >
+  template <typename T>
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(T&& t) const
-    -> decltype(std::forward<T>(t))
+      -> decltype(std::forward<T>(t))
   {
     return std::forward<T>(t);
   }
@@ -81,9 +93,9 @@ struct PassThrough
 
 struct Move
 {
-  template < typename T >
+  template <typename T>
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(T&& t) const
-    -> decltype(std::move(t))
+      -> decltype(std::move(t))
   {
     return std::move(t);
   }
@@ -91,9 +103,9 @@ struct Move
 
 struct PreInc
 {
-  template< typename Iter >
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(++std::forward<Iter>(iter))
+      -> decltype(++std::forward<Iter>(iter))
   {
     return ++std::forward<Iter>(iter);
   }
@@ -101,33 +113,33 @@ struct PreInc
 
 struct PreDec
 {
-  template< typename Iter >
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(--std::forward<Iter>(iter))
+      -> decltype(--std::forward<Iter>(iter))
   {
     return --std::forward<Iter>(iter);
   }
 };
 
-template < typename difference_type >
+template <typename difference_type>
 struct PlusEq
 {
   const difference_type& rhs;
-  template< typename Iter >
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(std::forward<Iter>(iter) += rhs)
+      -> decltype(std::forward<Iter>(iter) += rhs)
   {
     return std::forward<Iter>(iter) += rhs;
   }
 };
 
-template < typename difference_type >
+template <typename difference_type>
 struct MinusEq
 {
   const difference_type& rhs;
-  template< typename Iter >
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(std::forward<Iter>(iter) -= rhs)
+      -> decltype(std::forward<Iter>(iter) -= rhs)
   {
     return std::forward<Iter>(iter) -= rhs;
   }
@@ -135,9 +147,9 @@ struct MinusEq
 
 struct DeRef
 {
-  template< typename Iter >
+  template <typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(*std::forward<Iter>(iter))
+      -> decltype(*std::forward<Iter>(iter))
   {
     return *std::forward<Iter>(iter);
   }
@@ -145,7 +157,7 @@ struct DeRef
 
 struct Swap
 {
-  template< typename T0, typename T1 >
+  template <typename T0, typename T1>
   RAJA_HOST_DEVICE inline int operator()(T0&& t0, T1&& t1) const
   {
     using camp::safe_swap;
@@ -156,7 +168,7 @@ struct Swap
 
 struct IterSwap
 {
-  template< typename T0, typename T1 >
+  template <typename T0, typename T1>
   RAJA_HOST_DEVICE inline int operator()(T0&& t0, T1&& t1) const
   {
     using RAJA::safe_iter_swap;
@@ -169,9 +181,9 @@ struct IterSwap
 /*!
     \brief Call f on each member of t (f(t)...).
 */
-template < typename Tuple, typename F, camp::idx_t... Is >
-RAJA_HOST_DEVICE inline
-void zip_for_each_impl(Tuple&& t, F&& f, camp::idx_seq<Is...>)
+template <typename Tuple, typename F, camp::idx_t... Is>
+RAJA_HOST_DEVICE inline void
+zip_for_each_impl(Tuple&& t, F&& f, camp::idx_seq<Is...>)
 {
   camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple>(t)))...);
 }
@@ -179,51 +191,55 @@ void zip_for_each_impl(Tuple&& t, F&& f, camp::idx_seq<Is...>)
 /*!
     \brief Call f on each member of t0 and t1 (f(t0, t1)...).
 */
-template < typename Tuple0, typename Tuple1, typename F, camp::idx_t... Is >
-RAJA_HOST_DEVICE inline
-void zip_for_each_impl(Tuple0&& t0, Tuple1&& t1, F&& f, camp::idx_seq<Is...>)
+template <typename Tuple0, typename Tuple1, typename F, camp::idx_t... Is>
+RAJA_HOST_DEVICE inline void
+zip_for_each_impl(Tuple0&& t0, Tuple1&& t1, F&& f, camp::idx_seq<Is...>)
 {
-  camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple0>(t0)), RAJA::get<Is>(std::forward<Tuple1>(t1)))...);
+  camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple0>(t0)),
+                                RAJA::get<Is>(std::forward<Tuple1>(t1)))...);
 }
 
 /*!
     \brief Call f on each member of t (f(t)...).
 */
-template < typename Tuple, typename F >
-RAJA_HOST_DEVICE inline
-void zip_for_each(Tuple&& t, F&& f)
+template <typename Tuple, typename F>
+RAJA_HOST_DEVICE inline void zip_for_each(Tuple&& t, F&& f)
 {
-  zip_for_each_impl(std::forward<Tuple>(t), std::forward<F>(f), typename camp::decay<Tuple>::IdxSeq{});
+  zip_for_each_impl(std::forward<Tuple>(t), std::forward<F>(f),
+                    typename camp::decay<Tuple>::IdxSeq {});
 }
 
 /*!
     \brief Call f on each member of t0 and t1 (f(t0, t1)...).
 */
-template < typename Tuple0, typename Tuple1, typename F >
-RAJA_HOST_DEVICE inline
-void zip_for_each(Tuple0&& t0, Tuple1&& t1, F&& f)
+template <typename Tuple0, typename Tuple1, typename F>
+RAJA_HOST_DEVICE inline void zip_for_each(Tuple0&& t0, Tuple1&& t1, F&& f)
 {
-  static_assert(std::is_same<typename camp::decay<Tuple0>::IdxSeq, typename camp::decay<Tuple1>::IdxSeq>::value,
-      "Tuple0 and Tuple1 must have the same size");
-  zip_for_each_impl(std::forward<Tuple0>(t0), std::forward<Tuple1>(t1), std::forward<F>(f), typename camp::decay<Tuple0>::IdxSeq{});
+  static_assert(std::is_same<typename camp::decay<Tuple0>::IdxSeq,
+                             typename camp::decay<Tuple1>::IdxSeq>::value,
+                "Tuple0 and Tuple1 must have the same size");
+  zip_for_each_impl(std::forward<Tuple0>(t0), std::forward<Tuple1>(t1),
+                    std::forward<F>(f),
+                    typename camp::decay<Tuple0>::IdxSeq {});
 }
 
-} // end namespace detail
+}  // end namespace detail
 
 /*!
     \brief Tuple used by ZipIterator for storing multiple references and values.
-    Acts like a reference to its members allowing copy/move construction/assignment
-    based on the reference type of the zip_tuple.
+    Acts like a reference to its members allowing copy/move
+    construction/assignment based on the reference type of the zip_tuple.
 */
-template < bool is_val, typename ... Ts >
+template <bool is_val, typename... Ts>
 struct zip_tuple
 {
   using value_type = RAJA::tuple<Ts...>;
 
-  template < typename T >
-  using opp_type = typename std::conditional< is_val,
-        typename std::add_lvalue_reference<T>::type,
-        typename std::remove_reference<T>::type >::type;
+  template <typename T>
+  using opp_type =
+      typename std::conditional<is_val,
+                                typename std::add_lvalue_reference<T>::type,
+                                typename std::remove_reference<T>::type>::type;
 
   // zip_tuple type with opposite is_val
   using opp_tuple = zip_tuple<!is_val, opp_type<Ts>...>;
@@ -232,76 +248,114 @@ struct zip_tuple
   using IdxSeq = camp::make_idx_seq_t<sizeof...(Ts)>;
 
   // constructor from types convertible to Ts
-  template < typename ... Os
-           , typename = concepts::enable_if<type_traits::convertible_to<Os&&, Ts>...> >
+  template <
+      typename... Os,
+      typename = concepts::enable_if<type_traits::convertible_to<Os&&, Ts>...>>
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(Os&&... os)
-    : m_tuple(std::forward<Os>(os)...) { }
+      : m_tuple(std::forward<Os>(os)...)
+  {}
 
   // assignment from types convertible to Ts
-  template < typename ... Os
-           , typename = concepts::enable_if<type_traits::convertible_to<Os&&, typename std::remove_reference<Ts>::type>...> >
+  template <typename... Os,
+            typename = concepts::enable_if<type_traits::convertible_to<
+                Os&&,
+                typename std::remove_reference<Ts>::type>...>>
   zip_tuple& assign(Os&&... os)
-  { return assign_helper(IdxSeq{}, std::forward<Os>(os)...); }
+  {
+    return assign_helper(IdxSeq {}, std::forward<Os>(os)...);
+  }
 
   // copy and move constructors
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &      o)
-    : zip_tuple(          o , IdxSeq{}) { }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple& o) : zip_tuple(o, IdxSeq {})
+  {}
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o)
-    : zip_tuple(          o , IdxSeq{}) { }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &&     o)
-    : zip_tuple(std::move(o), IdxSeq{}) { } // move if is_val, pass-through otherwise
+      : zip_tuple(o, IdxSeq {})
+  {}
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple&& o)
+      : zip_tuple(std::move(o), IdxSeq {})
+  {}  // move if is_val, pass-through otherwise
 
   // copy and move assignment operators
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple &      o)
-  { return assign_helper(          o , IdxSeq{}); }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple& o)
+  {
+    return assign_helper(o, IdxSeq {});
+  }
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple const& o)
-  { return assign_helper(          o , IdxSeq{}); }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple &&     o)
-  { return assign_helper(std::move(o), IdxSeq{}); }
+  {
+    return assign_helper(o, IdxSeq {});
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple&& o)
+  {
+    return assign_helper(std::move(o), IdxSeq {});
+  }
 
   // copy and move constructors from opp_tuple type zip_tuples
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &      o)
-    : zip_tuple(          o , IdxSeq{}) { }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple& o) : zip_tuple(o, IdxSeq {})
+  {}
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o)
-    : zip_tuple(          o , IdxSeq{}) { }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &&     o)
-    : zip_tuple(std::move(o), IdxSeq{}) { } // move if is_val, pass-through otherwise
+      : zip_tuple(o, IdxSeq {})
+  {}
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple&& o)
+      : zip_tuple(std::move(o), IdxSeq {})
+  {}  // move if is_val, pass-through otherwise
 
   // copy and move assignment operators from opp_tuple type zip_tuples
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple &      o)
-  { return assign_helper(          o , IdxSeq{}); }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple& o)
+  {
+    return assign_helper(o, IdxSeq {});
+  }
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple const& o)
-  { return assign_helper(          o , IdxSeq{}); }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple &&     o)
-  { return assign_helper(std::move(o), IdxSeq{}); }
+  {
+    return assign_helper(o, IdxSeq {});
+  }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple&& o)
+  {
+    return assign_helper(std::move(o), IdxSeq {});
+  }
 
   // get member functions for zip_tuples
   // the reference type returned by get depends on the reference type
   // of the zip_tuple that get is called on
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr                         RAJA::tuple_element_t<I, value_type> & get() & noexcept
-  { return RAJA::get<I>(m_tuple); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr                         RAJA::tuple_element_t<I, value_type> const& get() const& noexcept
-  { return RAJA::get<I>(m_tuple); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::tuple_element_t<I, value_type>> && get() && noexcept
-  { return std::move(RAJA::get<I>(m_tuple)); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::tuple_element_t<I, value_type>> const&& get() const&& noexcept
-  { return std::move(RAJA::get<I>(m_tuple)); }
+  template <camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr RAJA::tuple_element_t<I, value_type>&
+  get() & noexcept
+  {
+    return RAJA::get<I>(m_tuple);
+  }
+  template <camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr RAJA::tuple_element_t<I, value_type> const&
+  get() const& noexcept
+  {
+    return RAJA::get<I>(m_tuple);
+  }
+  template <camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+      RAJA::tuple_element_t<I, value_type>>&&
+  get() && noexcept
+  {
+    return std::move(RAJA::get<I>(m_tuple));
+  }
+  template <camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+      RAJA::tuple_element_t<I, value_type>> const&&
+  get() const&& noexcept
+  {
+    return std::move(RAJA::get<I>(m_tuple));
+  }
 
   // safe_swap that calls swap on each pair in the tuple
-  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs, zip_tuple& rhs)
+  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs,
+                                                     zip_tuple& rhs)
   {
-    detail::zip_for_each(lhs, rhs, detail::Swap{});
+    detail::zip_for_each(lhs, rhs, detail::Swap {});
   }
 
   // safe_swap for swapping zip_tuples with opposite is_val
   // calls swap on each pair in the tuple
-  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs, opp_tuple& rhs)
+  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs,
+                                                     opp_tuple& rhs)
   {
-    detail::zip_for_each(lhs, rhs, detail::Swap{});
+    detail::zip_for_each(lhs, rhs, detail::Swap {});
   }
 
   // allow printing of zip_tuples by printing value_type
@@ -313,67 +367,111 @@ struct zip_tuple
 private:
   // move if is_val is true, otherwise copy in move constructor
   // this allows values to be moved, and references to stay lvalue references
-  using IsValMover = typename std::conditional<is_val, detail::Move, detail::PassThrough>::type;
+  using IsValMover = typename std::
+      conditional<is_val, detail::Move, detail::PassThrough>::type;
 
   value_type m_tuple;
 
   // assignment helper from types convertible to Ts
-  template < typename ... Os, camp::idx_t ... Is >
+  template <typename... Os, camp::idx_t... Is>
   zip_tuple& assign_helper(camp::idx_seq<Is...>, Os&&... os)
-  { camp::sink(get<Is>() = std::forward<Os>(os)...); return *this; }
+  {
+    camp::sink(get<Is>() = std::forward<Os>(os)...);
+    return *this;
+  }
 
   // copy and move constructor helpers
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &      o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &&     o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(IsValMover{}(o))...) { } // move if is_val, pass-through otherwise
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {}
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o,
+                                         camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {}
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple&& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(IsValMover {}(o))...)
+  {}  // move if is_val, pass-through otherwise
 
   // copy and move assignment operator helpers
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple &      o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); } return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple const& o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); } return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple &&     o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...); } return *this; }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o)
+    {
+      camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    }
+    return *this;
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple const& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o)
+    {
+      camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    }
+    return *this;
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple&& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o)
+    {
+      camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...);
+    }
+    return *this;
+  }
 
   // copy and move constructor helpers from opp_tuple type zip_tuples
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &      o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &&     o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(IsValMover{}(o))...) { } // move if is_val, pass-through otherwise
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {}
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o,
+                                         camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {}
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple&& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(IsValMover {}(o))...)
+  {}  // move if is_val, pass-through otherwise
 
   // copy and move assignment operator helpers from opp_tuple type zip_tuples
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple &      o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple const& o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple &&     o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...); return *this; }
-
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    return *this;
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple const& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    return *this;
+  }
+  template <camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple&& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...);
+    return *this;
+  }
 };
 
 // alias zip_ref to zip_tuple capable of storing references (!is_val)
-template < typename ... Ts >
+template <typename... Ts>
 using zip_ref = zip_tuple<false, Ts...>;
 
 // alias zip_val to zip_tuple suitable for storing values (is_val)
-template < typename ... Ts >
+template <typename... Ts>
 using zip_val = zip_tuple<true, Ts...>;
 
 }  // end namespace RAJA
diff --git a/src/AlignedRangeIndexSetBuilders.cpp b/src/AlignedRangeIndexSetBuilders.cpp
index d95859d71d..1fc5b37f27 100644
--- a/src/AlignedRangeIndexSetBuilders.cpp
+++ b/src/AlignedRangeIndexSetBuilders.cpp
@@ -51,7 +51,8 @@ void buildIndexSetAligned(
   if (length == 0) return;
 
   /* only transform relatively large */
-  if (length > range_min_length) {
+  if (length > range_min_length)
+  {
     /* build a rindex array from an index array */
     RAJA::Index_type docount = 0;
     RAJA::Index_type inrange = -1;
@@ -60,30 +61,41 @@ void buildIndexSetAligned(
     /* first, gather statistics */
     /****************************/
 
-    RAJA::Index_type scanVal = indices_in[0];
+    RAJA::Index_type scanVal    = indices_in[0];
     RAJA::Index_type sliceCount = 0;
-    for (RAJA::Index_type ii = 1; ii < length; ++ii) {
+    for (RAJA::Index_type ii = 1; ii < length; ++ii)
+    {
       RAJA::Index_type lookAhead = indices_in[ii];
 
-      if (inrange == -1) {
-        if ((lookAhead == scanVal + 1) && ((scanVal % range_align) == 0)) {
+      if (inrange == -1)
+      {
+        if ((lookAhead == scanVal + 1) && ((scanVal % range_align) == 0))
+        {
           inrange = 1;
-        } else {
+        }
+        else
+        {
           inrange = 0;
         }
       }
 
-      if (lookAhead == scanVal + 1) {
-        if ((inrange == 0) && ((scanVal % range_align) == 0)) {
-          if (sliceCount != 0) {
+      if (lookAhead == scanVal + 1)
+      {
+        if ((inrange == 0) && ((scanVal % range_align) == 0))
+        {
+          if (sliceCount != 0)
+          {
             docount += 1 + sliceCount; /* length + singletons */
           }
-          inrange = 1;
+          inrange    = 1;
           sliceCount = 0;
         }
         ++sliceCount; /* account for scanVal */
-      } else {
-        if (inrange == 1) {
+      }
+      else
+      {
+        if (inrange == 1)
+        {
           /* we can tighten this up by schleping any trailing */
           /* sigletons off into the subsequent singleton */
           /* array.  We would then also need to recheck the */
@@ -93,9 +105,11 @@ void buildIndexSetAligned(
           /* a range array */
           ++sliceCount;
           docount += 2; /* length + begin */
-          inrange = 0;
+          inrange    = 0;
           sliceCount = 0;
-        } else {
+        }
+        else
+        {
           ++sliceCount; /* account for scanVal */
         }
       }
@@ -103,22 +117,29 @@ void buildIndexSetAligned(
       scanVal = lookAhead;
     }  // end loop to gather statistics
 
-    if (inrange != -1) {
-      if (inrange) {
+    if (inrange != -1)
+    {
+      if (inrange)
+      {
         ++sliceCount;
         docount += 2; /* length + begin */
-      } else {
+      }
+      else
+      {
         ++sliceCount;
         docount += 1 + sliceCount; /* length + singletons */
       }
-    } else if (scanVal != -1) {
+    }
+    else if (scanVal != -1)
+    {
       ++sliceCount;
       docount += 2;
     }
     ++docount; /* zero length termination */
 
     /* What is the cutoff criteria for generating the rindex array? */
-    if (docount < (length * (range_align - 1)) / range_align) {
+    if (docount < (length * (range_align - 1)) / range_align)
+    {
       /* The rindex array can either contain a pointer into the */
       /* original index array, *or* it can repack the data from the */
       /* original index array.  Benefits of repacking could include */
@@ -132,33 +153,44 @@ void buildIndexSetAligned(
       RAJA::Index_type dobegin;
       inrange = -1;
 
-      scanVal = indices_in[0];
+      scanVal    = indices_in[0];
       sliceCount = 0;
-      dobegin = scanVal;
-      for (RAJA::Index_type ii = 1; ii < length; ++ii) {
+      dobegin    = scanVal;
+      for (RAJA::Index_type ii = 1; ii < length; ++ii)
+      {
         RAJA::Index_type lookAhead = indices_in[ii];
 
-        if (inrange == -1) {
-          if ((lookAhead == scanVal + 1) && ((scanVal % range_align) == 0)) {
+        if (inrange == -1)
+        {
+          if ((lookAhead == scanVal + 1) && ((scanVal % range_align) == 0))
+          {
             inrange = 1;
-          } else {
+          }
+          else
+          {
             inrange = 0;
             dobegin = ii - 1;
           }
         }
-        if (lookAhead == scanVal + 1) {
-          if ((inrange == 0) && ((scanVal % range_align) == 0)) {
-            if (sliceCount != 0) {
-              iset.push_back(ListSegment(&indices_in[dobegin], sliceCount,
-                                          work_res));
+        if (lookAhead == scanVal + 1)
+        {
+          if ((inrange == 0) && ((scanVal % range_align) == 0))
+          {
+            if (sliceCount != 0)
+            {
+              iset.push_back(
+                  ListSegment(&indices_in[dobegin], sliceCount, work_res));
             }
-            inrange = 1;
-            dobegin = scanVal;
+            inrange    = 1;
+            dobegin    = scanVal;
             sliceCount = 0;
           }
           ++sliceCount; /* account for scanVal */
-        } else {
-          if (inrange == 1) {
+        }
+        else
+        {
+          if (inrange == 1)
+          {
             /* we can tighten this up by schleping any trailing */
             /* sigletons off into the subsequent singleton */
             /* array.  We would then also need to recheck the */
@@ -168,10 +200,12 @@ void buildIndexSetAligned(
             /* a range array */
             ++sliceCount;
             iset.push_back(RangeSegment(dobegin, dobegin + sliceCount));
-            inrange = 0;
+            inrange    = 0;
             sliceCount = 0;
-            dobegin = ii;
-          } else {
+            dobegin    = ii;
+          }
+          else
+          {
             ++sliceCount; /* account for scanVal */
           }
         }
@@ -179,22 +213,32 @@ void buildIndexSetAligned(
         scanVal = lookAhead;
       }  // for (RAJA::Index_type ii ...
 
-      if (inrange != -1) {
-        if (inrange) {
+      if (inrange != -1)
+      {
+        if (inrange)
+        {
           ++sliceCount;
           iset.push_back(RangeSegment(dobegin, dobegin + sliceCount));
-        } else {
+        }
+        else
+        {
           ++sliceCount;
-          iset.push_back(ListSegment(&indices_in[dobegin], sliceCount,
-                                      work_res));
+          iset.push_back(
+              ListSegment(&indices_in[dobegin], sliceCount, work_res));
         }
-      } else if (scanVal != -1) {
+      }
+      else if (scanVal != -1)
+      {
         iset.push_back(ListSegment(&scanVal, 1, work_res));
       }
-    } else {  // !(docount < (length*range_align-1))/range_align)
+    }
+    else
+    {  // !(docount < (length * (range_align - 1)) / range_align)
       iset.push_back(ListSegment(indices_in, length, work_res));
     }
-  } else {  // else !(length > range_min_length)
+  }
+  else
+  {  // else !(length > range_min_length)
     iset.push_back(ListSegment(indices_in, length, work_res));
   }
 }
diff --git a/src/DepGraphNode.cpp b/src/DepGraphNode.cpp
index 176d9e855d..df994ce396 100644
--- a/src/DepGraphNode.cpp
+++ b/src/DepGraphNode.cpp
@@ -29,9 +29,11 @@ void DepGraphNode::print(std::ostream& os) const
      << m_semaphore_reload_value << std::endl;
 
   os << "     num dep tasks = " << m_num_dep_tasks;
-  if (m_num_dep_tasks > 0) {
+  if (m_num_dep_tasks > 0)
+  {
     os << " ( ";
-    for (int jj = 0; jj < m_num_dep_tasks; ++jj) {
+    for (int jj = 0; jj < m_num_dep_tasks; ++jj)
+    {
       os << m_dep_task[jj] << "  ";
     }
     os << " )";
diff --git a/src/KokkosPluginLoader.cpp b/src/KokkosPluginLoader.cpp
index fa05e0faf8..d84367dc29 100644
--- a/src/KokkosPluginLoader.cpp
+++ b/src/KokkosPluginLoader.cpp
@@ -15,43 +15,44 @@
 const uint64_t kokkos_interface_version = 20171029;
 
 RAJA_INLINE
-bool
-isSharedObject(const std::string& filename)
+bool isSharedObject(const std::string& filename)
 {
-  return (filename.size() > 3 && !filename.compare(filename.size() - 3, 3, ".so"));
+  return (filename.size() > 3 &&
+          !filename.compare(filename.size() - 3, 3, ".so"));
 }
 
-template<typename function>
-RAJA_INLINE
-void
+template <typename function>
+RAJA_INLINE void
 getFunction(void* plugin, std::vector<function>& functions, const char* fname)
 {
-  #ifndef _WIN32
-  function func = (function) dlsym(plugin, fname);
+#ifndef _WIN32
+  function func = (function)dlsym(plugin, fname);
   if (func)
     functions.push_back(func);
   else
     printf("[KokkosPluginLoader]: dlsym failed: %s\n", dlerror());
-  #else
+#else
   RAJA_UNUSED_ARG(plugin);
   RAJA_UNUSED_ARG(functions);
   RAJA_UNUSED_ARG(fname);
-  #endif
+#endif
 }
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 KokkosPluginLoader::KokkosPluginLoader()
 {
-  char *env = getenv("KOKKOS_PLUGINS");
+  char* env = getenv("KOKKOS_PLUGINS");
   if (env == nullptr)
   {
     return;
   }
   initDirectory(std::string(env));
 
-  for (auto &func : init_functions)
+  for (auto& func : init_functions)
   {
     func(0, kokkos_interface_version, 0, nullptr);
   }
@@ -59,7 +60,7 @@ KokkosPluginLoader::KokkosPluginLoader()
 
 void KokkosPluginLoader::preLaunch(const RAJA::util::PluginContext& p)
 {
-  for (auto &func : pre_functions)
+  for (auto& func : pre_functions)
   {
     func("", 0, &(p.kID));
   }
@@ -67,7 +68,7 @@ void KokkosPluginLoader::preLaunch(const RAJA::util::PluginContext& p)
 
 void KokkosPluginLoader::postLaunch(const RAJA::util::PluginContext& p)
 {
-  for (auto &func : post_functions)
+  for (auto& func : post_functions)
   {
     func(p.kID);
   }
@@ -75,7 +76,7 @@ void KokkosPluginLoader::postLaunch(const RAJA::util::PluginContext& p)
 
 void KokkosPluginLoader::finalize()
 {
-  for (auto &func : finalize_functions)
+  for (auto& func : finalize_functions)
   {
     func();
   }
@@ -86,10 +87,10 @@ void KokkosPluginLoader::finalize()
 }
 
 // Initialize plugin from a shared object file specified by 'path'.
-void KokkosPluginLoader::initPlugin(const std::string &path)
+void KokkosPluginLoader::initPlugin(const std::string& path)
 {
-  #ifndef _WIN32
-  void *plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
+#ifndef _WIN32
+  void* plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
   if (!plugin)
   {
     printf("[KokkosPluginLoader]: dlopen failed: %s\n", dlerror());
@@ -98,28 +99,31 @@ void KokkosPluginLoader::initPlugin(const std::string &path)
   // Getting and storing supported kokkos functions.
   getFunction<init_function>(plugin, init_functions, "kokkosp_init_library");
 
-  getFunction<pre_function>(plugin, pre_functions, "kokkosp_begin_parallel_for");
+  getFunction<pre_function>(plugin, pre_functions,
+                            "kokkosp_begin_parallel_for");
 
-  getFunction<post_function>(plugin, post_functions, "kokkosp_end_parallel_for");
+  getFunction<post_function>(plugin, post_functions,
+                             "kokkosp_end_parallel_for");
 
-  getFunction<finalize_function>(plugin, finalize_functions, "kokkosp_finalize_library");
-  #else
+  getFunction<finalize_function>(plugin, finalize_functions,
+                                 "kokkosp_finalize_library");
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 // Initialize all plugins in a directory specified by 'path'.
-void KokkosPluginLoader::initDirectory(const std::string &path)
+void KokkosPluginLoader::initDirectory(const std::string& path)
 {
-  #ifndef _WIN32
+#ifndef _WIN32
   if (isSharedObject(path))
   {
     initPlugin(path);
     return;
   }
-  
-  DIR *dir;
-  struct dirent *file;
+
+  DIR* dir;
+  struct dirent* file;
 
   if ((dir = opendir(path.c_str())) != NULL)
   {
@@ -136,14 +140,17 @@ void KokkosPluginLoader::initDirectory(const std::string &path)
   {
     perror("[KokkosPluginLoader]: Could not open plugin directory");
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 void linkKokkosPluginLoader() {}
 
-} // end namespace util
-} // end namespace RAJA
+}  // end namespace util
+}  // end namespace RAJA
 
-static RAJA::util::PluginRegistry::add<RAJA::util::KokkosPluginLoader> P("KokkosPluginLoader", "Dynamically load plugins ported from the Kokkos library.");
+static RAJA::util::PluginRegistry::add<RAJA::util::KokkosPluginLoader>
+    P("KokkosPluginLoader",
+      "Dynamically load plugins ported from the Kokkos "
+      "library.");
diff --git a/src/LockFreeIndexSetBuilders.cpp b/src/LockFreeIndexSetBuilders.cpp
index f9ef1f51c8..3b5f314138 100644
--- a/src/LockFreeIndexSetBuilders.cpp
+++ b/src/LockFreeIndexSetBuilders.cpp
@@ -38,15 +38,14 @@ namespace RAJA
  ******************************************************************************
  *
  * Generate a lock-free "block" index set (planar division) containing
- * range segments. 
+ * range segments.
  *
  ******************************************************************************
  */
-void buildLockFreeBlockIndexset(
-    RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
-    int fastDim,
-    int midDim,
-    int slowDim)
+void buildLockFreeBlockIndexset(RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
+                                int fastDim,
+                                int midDim,
+                                int slowDim)
 {
   constexpr int PROFITABLE_ENTITY_THRESHOLD_BLOCK = 100;
 
@@ -56,10 +55,13 @@ void buildLockFreeBlockIndexset(
 
   if ((midDim | slowDim) == 0) /* 1d mesh */
   {
-    if (fastDim / PROFITABLE_ENTITY_THRESHOLD_BLOCK <= 1) {
+    if (fastDim / PROFITABLE_ENTITY_THRESHOLD_BLOCK <= 1)
+    {
       // printf("%d %d\n", 0, fastDim) ;
       iset.push_back(RAJA::RangeSegment(0, fastDim));
-    } else {
+    }
+    else
+    {
       /* This just sets up the schedule -- a truly safe */
       /* execution of this schedule would require a check */
       /* for completion of dependent threads before execution. */
@@ -68,22 +70,28 @@ void buildLockFreeBlockIndexset(
       /* profitability ratio is really bad, but for */
       /* now use the brain dead approach. */
       int numSegments = numThreads * 3;
-      for (int lane = 0; lane < 3; ++lane) {
-        for (int i = lane; i < numSegments; i += 3) {
+      for (int lane = 0; lane < 3; ++lane)
+      {
+        for (int i = lane; i < numSegments; i += 3)
+        {
           RAJA::Index_type start = i * fastDim / numSegments;
-          RAJA::Index_type end = (i + 1) * fastDim / numSegments;
+          RAJA::Index_type end   = (i + 1) * fastDim / numSegments;
           // printf("%d %d\n", start, end) ;
           iset.push_back(RAJA::RangeSegment(start, end));
         }
       }
     }
-  } else if (slowDim == 0) /* 2d mesh */
+  }
+  else if (slowDim == 0) /* 2d mesh */
   {
     int rowsPerSegment = midDim / (3 * numThreads);
-    if (rowsPerSegment == 0) {
+    if (rowsPerSegment == 0)
+    {
       // printf("%d %d\n", 0, fastDim*midDim) ;
       iset.push_back(RAJA::RangeSegment(0, fastDim * midDim));
-    } else {
+    }
+    else
+    {
       /* This just sets up the schedule -- a truly safe */
       /* execution of this schedule would require a check */
       /* for completion of dependent threads before execution. */
@@ -91,13 +99,15 @@ void buildLockFreeBlockIndexset(
       /* We might want to force one thread if the */
       /* profitability ratio is really bad, but for */
       /* now use the brain dead approach. */
-      for (int lane = 0; lane < 3; ++lane) {
-        for (int i = 0; i < numThreads; ++i) {
+      for (int lane = 0; lane < 3; ++lane)
+      {
+        for (int i = 0; i < numThreads; ++i)
+        {
           RAJA::Index_type startRow = i * midDim / numThreads;
-          RAJA::Index_type endRow = (i + 1) * midDim / numThreads;
-          RAJA::Index_type start = startRow * fastDim;
-          RAJA::Index_type end = endRow * fastDim;
-          RAJA::Index_type len = end - start;
+          RAJA::Index_type endRow   = (i + 1) * midDim / numThreads;
+          RAJA::Index_type start    = startRow * fastDim;
+          RAJA::Index_type end      = endRow * fastDim;
+          RAJA::Index_type len      = end - start;
           // printf("%d %d\n", start + (lane  )*len/3,
           //                   start + (lane+1)*len/3  ) ;
           iset.push_back(RAJA::RangeSegment(start + (lane)*len / 3,
@@ -105,7 +115,9 @@ void buildLockFreeBlockIndexset(
         }
       }
     }
-  } else { /* 3d mesh */
+  }
+  else
+  { /* 3d mesh */
 
     // this requires dependence graph - commenting out for now
 
@@ -209,14 +221,14 @@ void buildLockFreeColorIndexset(
     RAJA::Index_type* elemPermutation,
     RAJA::Index_type* ielemPermutation)
 {
-  bool done = false;
+  bool done      = false;
   bool* isMarked = new bool[numEntity];
 
-  RAJA::Index_type numWorkset = 0;
+  RAJA::Index_type numWorkset    = 0;
   RAJA::Index_type* worksetDelim = new RAJA::Index_type[numEntity];
 
   RAJA::Index_type worksetSize = 0;
-  RAJA::Index_type* workset = new RAJA::Index_type[numEntity];
+  RAJA::Index_type* workset    = new RAJA::Index_type[numEntity];
 
   RAJA::Index_type* rangeToDomain =
       new RAJA::Index_type[numEntityRange * numRangePerDomain];
@@ -225,12 +237,15 @@ void buildLockFreeColorIndexset(
   memset(rangeToDomainCount, 0, numEntityRange * sizeof(RAJA::Index_type));
 
   /* create an inverse mapping */
-  for (int i = 0; i < numEntity; ++i) {
-    for (int j = 0; j < numRangePerDomain; ++j) {
-      RAJA::Index_type id = domainToRange[i * numRangePerDomain + j];
+  for (int i = 0; i < numEntity; ++i)
+  {
+    for (int j = 0; j < numRangePerDomain; ++j)
+    {
+      RAJA::Index_type id  = domainToRange[i * numRangePerDomain + j];
       RAJA::Index_type idx = id * numRangePerDomain + rangeToDomainCount[id]++;
       if (idx > numEntityRange * numRangePerDomain ||
-          rangeToDomainCount[id] > numRangePerDomain) {
+          rangeToDomainCount[id] > numRangePerDomain)
+      {
         printf("foiled!\n");
         exit(-1);
       }
@@ -238,30 +253,39 @@ void buildLockFreeColorIndexset(
     }
   }
 
-  while (!done) {
+  while (!done)
+  {
     done = true;
 
-    for (int i = 0; i < numEntity; ++i) {
+    for (int i = 0; i < numEntity; ++i)
+    {
       isMarked[i] = false;
     }
 
-    for (int i = 0; i < worksetSize; ++i) {
+    for (int i = 0; i < worksetSize; ++i)
+    {
       isMarked[workset[i]] = true;
     }
 
-    for (int i = 0; i < numEntity; ++i) {
-      if (isMarked[i] == false) {
+    for (int i = 0; i < numEntity; ++i)
+    {
+      if (isMarked[i] == false)
+      {
         done = false;
-        if (worksetSize >= numEntity) {
+        if (worksetSize >= numEntity)
+        {
           printf("foiled!\n");
           exit(-1);
         }
         workset[worksetSize++] = i;
-        for (int j = 0; j < numRangePerDomain; ++j) {
+        for (int j = 0; j < numRangePerDomain; ++j)
+        {
           RAJA::Index_type id = domainToRange[i * numRangePerDomain + j];
-          for (int k = 0; k < rangeToDomainCount[id]; ++k) {
+          for (int k = 0; k < rangeToDomainCount[id]; ++k)
+          {
             RAJA::Index_type idx = rangeToDomain[id * numRangePerDomain + k];
-            if (idx < 0 || idx >= numEntity) {
+            if (idx < 0 || idx >= numEntity)
+            {
               printf("foiled!\n");
               exit(-1);
             }
@@ -270,7 +294,8 @@ void buildLockFreeColorIndexset(
         }
       }
     }
-    if (done == false) {
+    if (done == false)
+    {
       worksetDelim[numWorkset++] = worksetSize;
     }
   }
@@ -278,45 +303,58 @@ void buildLockFreeColorIndexset(
   delete[] rangeToDomainCount;
   delete[] rangeToDomain;
 
-  if (worksetSize != numEntity) {
+  if (worksetSize != numEntity)
+  {
     printf("foiled!!!\n");
     exit(-1);
   }
 
   /* we may want to create a permutation array here */
-  if (elemPermutation != 0l) {
+  if (elemPermutation != 0l)
+  {
     /* send back permutaion array, and corresponding range segments */
 
     memcpy(elemPermutation, &workset[0], numEntity * sizeof(int));
-    if (ielemPermutation != 0l) {
-      for (int i = 0; i < numEntity; ++i) {
+    if (ielemPermutation != 0l)
+    {
+      for (int i = 0; i < numEntity; ++i)
+      {
         ielemPermutation[elemPermutation[i]] = i;
       }
     }
     RAJA::Index_type end = 0;
-    for (int i = 0; i < numWorkset; ++i) {
+    for (int i = 0; i < numWorkset; ++i)
+    {
       RAJA::Index_type begin = end;
-      end = worksetDelim[i];
+      end                    = worksetDelim[i];
       iset.push_back(RAJA::RangeSegment(begin, end));
     }
-  } else {
+  }
+  else
+  {
     RAJA::Index_type end = 0;
-    for (int i = 0; i < numWorkset; ++i) {
+    for (int i = 0; i < numWorkset; ++i)
+    {
       RAJA::Index_type begin = end;
-      end = worksetDelim[i];
-      bool isRange = true;
-      for (int j = begin + 1; j < end; ++j) {
-        if (workset[j - 1] + 1 != workset[j]) {
+      end                    = worksetDelim[i];
+      bool isRange           = true;
+      for (int j = begin + 1; j < end; ++j)
+      {
+        if (workset[j - 1] + 1 != workset[j])
+        {
           isRange = false;
           break;
         }
       }
-      if (isRange) {
+      if (isRange)
+      {
         iset.push_back(
             RAJA::RangeSegment(workset[begin], workset[end - 1] + 1));
-      } else {
-        iset.push_back(RAJA::ListSegment(&workset[begin], end - begin,
-                                         work_res));
+      }
+      else
+      {
+        iset.push_back(
+            RAJA::ListSegment(&workset[begin], end - begin, work_res));
         // printf("segment %d\n", i) ;
         // for (int j=begin; j<end; ++j) {
         //    printf("%d\n", workset[j]) ;
diff --git a/src/MemUtils_SYCL.cpp b/src/MemUtils_SYCL.cpp
index 0b5f1b8be6..c7bdbadb1a 100644
--- a/src/MemUtils_SYCL.cpp
+++ b/src/MemUtils_SYCL.cpp
@@ -49,7 +49,7 @@ syclInfo tl_status;
 #endif
 
 //! State of raja sycl queue synchronization for sycl reducer objects
-std::unordered_map<cl::sycl::queue, bool> g_queue_info_map{
+std::unordered_map<cl::sycl::queue, bool> g_queue_info_map {
     {cl::sycl::queue(), true}};
 
 }  // namespace detail
diff --git a/src/PluginStrategy.cpp b/src/PluginStrategy.cpp
index e39c5718a8..eee0962fc4 100644
--- a/src/PluginStrategy.cpp
+++ b/src/PluginStrategy.cpp
@@ -9,22 +9,24 @@
 
 RAJA_INSTANTIATE_REGISTRY(PluginRegistry);
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 PluginStrategy::PluginStrategy() = default;
 
-void PluginStrategy::init(const PluginOptions&) { }
+void PluginStrategy::init(const PluginOptions&) {}
 
-void PluginStrategy::preCapture(const PluginContext&) { }
+void PluginStrategy::preCapture(const PluginContext&) {}
 
-void PluginStrategy::postCapture(const PluginContext&) { }
+void PluginStrategy::postCapture(const PluginContext&) {}
 
-void PluginStrategy::preLaunch(const PluginContext&) { }
+void PluginStrategy::preLaunch(const PluginContext&) {}
 
-void PluginStrategy::postLaunch(const PluginContext&) { }
+void PluginStrategy::postLaunch(const PluginContext&) {}
 
-void PluginStrategy::finalize() { }
+void PluginStrategy::finalize() {}
 
-}
-}
+}  // namespace util
+}  // namespace RAJA
diff --git a/src/RuntimePluginLoader.cpp b/src/RuntimePluginLoader.cpp
index 3da10cda8c..94d19699b6 100644
--- a/src/RuntimePluginLoader.cpp
+++ b/src/RuntimePluginLoader.cpp
@@ -13,18 +13,20 @@
 #endif
 
 RAJA_INLINE
-bool
-isSharedObject(const std::string& filename)
+bool isSharedObject(const std::string& filename)
 {
-  return (filename.size() > 3 && !filename.compare(filename.size() - 3, 3, ".so"));
+  return (filename.size() > 3 &&
+          !filename.compare(filename.size() - 3, 3, ".so"));
 }
 
-namespace RAJA {
-namespace util {
-  
+namespace RAJA
+{
+namespace util
+{
+
 RuntimePluginLoader::RuntimePluginLoader()
 {
-  char *env = ::getenv("RAJA_PLUGINS");
+  char* env = ::getenv("RAJA_PLUGINS");
   if (nullptr == env)
   {
     return;
@@ -35,7 +37,7 @@ RuntimePluginLoader::RuntimePluginLoader()
 void RuntimePluginLoader::init(const RAJA::util::PluginOptions& p)
 {
   initDirectory(p.str);
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->init(p);
   }
@@ -43,7 +45,7 @@ void RuntimePluginLoader::init(const RAJA::util::PluginOptions& p)
 
 void RuntimePluginLoader::preCapture(const RAJA::util::PluginContext& p)
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->preCapture(p);
   }
@@ -51,7 +53,7 @@ void RuntimePluginLoader::preCapture(const RAJA::util::PluginContext& p)
 
 void RuntimePluginLoader::postCapture(const RAJA::util::PluginContext& p)
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->postCapture(p);
   }
@@ -59,7 +61,7 @@ void RuntimePluginLoader::postCapture(const RAJA::util::PluginContext& p)
 
 void RuntimePluginLoader::preLaunch(const RAJA::util::PluginContext& p)
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->preLaunch(p);
   }
@@ -67,7 +69,7 @@ void RuntimePluginLoader::preLaunch(const RAJA::util::PluginContext& p)
 
 void RuntimePluginLoader::postLaunch(const RAJA::util::PluginContext& p)
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->postLaunch(p);
   }
@@ -75,7 +77,7 @@ void RuntimePluginLoader::postLaunch(const RAJA::util::PluginContext& p)
 
 void RuntimePluginLoader::finalize()
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->finalize();
   }
@@ -83,42 +85,44 @@ void RuntimePluginLoader::finalize()
 }
 
 // Initialize plugin from a shared object file specified by 'path'.
-void RuntimePluginLoader::initPlugin(const std::string &path)
+void RuntimePluginLoader::initPlugin(const std::string& path)
 {
-  #ifndef _WIN32
-  void *plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
+#ifndef _WIN32
+  void* plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
   if (!plugin)
   {
     printf("[RuntimePluginLoader]: dlopen failed: %s\n", dlerror());
   }
 
-  RuntimePluginLoader::Parent *(*getPlugin)() = (RuntimePluginLoader::Parent * (*)()) dlsym(plugin, "getPlugin");
+  RuntimePluginLoader::Parent* (*getPlugin)() =
+      (RuntimePluginLoader::Parent * (*)()) dlsym(plugin, "getPlugin");
 
   if (getPlugin)
   {
-    plugins.push_back(std::unique_ptr<RuntimePluginLoader::Parent>(getPlugin()));
+    plugins.push_back(
+        std::unique_ptr<RuntimePluginLoader::Parent>(getPlugin()));
   }
   else
   {
     printf("[RuntimePluginLoader]: dlsym failed: %s\n", dlerror());
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 // Initialize all plugins in a directory specified by 'path'.
-void RuntimePluginLoader::initDirectory(const std::string &path)
+void RuntimePluginLoader::initDirectory(const std::string& path)
 {
-  #ifndef _WIN32
+#ifndef _WIN32
   if (isSharedObject(path))
   {
     initPlugin(path);
     return;
   }
-  
-  DIR *dir;
-  struct dirent *file;
+
+  DIR* dir;
+  struct dirent* file;
 
   if ((dir = opendir(path.c_str())) != NULL)
   {
@@ -135,14 +139,15 @@ void RuntimePluginLoader::initDirectory(const std::string &path)
   {
     perror("[RuntimePluginLoader]: Could not open plugin directory");
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 void linkRuntimePluginLoader() {}
 
-} // end namespace util
-} // end namespace RAJA
+}  // end namespace util
+}  // end namespace RAJA
 
-static RAJA::util::PluginRegistry::add<RAJA::util::RuntimePluginLoader> P("RuntimePluginLoader", "Dynamically load RAJA plugins.");
+static RAJA::util::PluginRegistry::add<RAJA::util::RuntimePluginLoader>
+    P("RuntimePluginLoader", "Dynamically load RAJA plugins.");
diff --git a/src/TensorStats.cpp b/src/TensorStats.cpp
index b650b691f9..d34a6c8159 100644
--- a/src/TensorStats.cpp
+++ b/src/TensorStats.cpp
@@ -10,18 +10,18 @@
 
 int RAJA::tensor_stats::indent = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_copy = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_copy_ctor = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_copy           = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_copy_ctor      = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_broadcast_ctor = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_load_packed = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_load_packed_n = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_load_strided = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_load_packed    = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_load_packed_n  = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_load_strided   = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_load_strided_n = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_store_packed = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_store_packed_n = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_store_strided = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_store_packed    = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_store_packed_n  = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_store_strided   = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_store_strided_n = 0;
 
 camp::idx_t RAJA::tensor_stats::num_vector_broadcast = 0;
@@ -29,38 +29,39 @@ camp::idx_t RAJA::tensor_stats::num_vector_broadcast = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_get = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_set = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_add = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_add      = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_subtract = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_multiply = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_divide = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_divide   = 0;
 
 camp::idx_t RAJA::tensor_stats::num_vector_fma = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_fms = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_sum = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_max = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_min = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_sum  = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_max  = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_min  = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_vmax = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_vmin = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_dot = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_dot  = 0;
 
-camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_row_row = 0;
+camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_row_row    = 0;
 camp::idx_t RAJA::tensor_stats::num_matrix_mm_multacc_row_row = 0;
-camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_col_col = 0;
+camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_col_col    = 0;
 camp::idx_t RAJA::tensor_stats::num_matrix_mm_multacc_col_col = 0;
 
-void RAJA::tensor_stats::resetVectorStats(){
-  num_vector_copy = 0;
-  num_vector_copy_ctor = 0;
+void RAJA::tensor_stats::resetVectorStats()
+{
+  num_vector_copy           = 0;
+  num_vector_copy_ctor      = 0;
   num_vector_broadcast_ctor = 0;
 
-  num_vector_load_packed = 0;
-  num_vector_load_packed_n = 0;
-  num_vector_load_strided = 0;
-  num_vector_load_strided_n = 0;
-  num_vector_store_packed = 0;
-  num_vector_store_packed_n = 0;
-  num_vector_store_strided = 0;
+  num_vector_load_packed     = 0;
+  num_vector_load_packed_n   = 0;
+  num_vector_load_strided    = 0;
+  num_vector_load_strided_n  = 0;
+  num_vector_store_packed    = 0;
+  num_vector_store_packed_n  = 0;
+  num_vector_store_strided   = 0;
   num_vector_store_strided_n = 0;
 
   num_vector_broadcast = 0;
@@ -68,29 +69,34 @@ void RAJA::tensor_stats::resetVectorStats(){
   num_vector_get = 0;
   num_vector_set = 0;
 
-  num_vector_add = 0;
+  num_vector_add      = 0;
   num_vector_subtract = 0;
   num_vector_multiply = 0;
-  num_vector_divide = 0;
+  num_vector_divide   = 0;
 
-  num_vector_fma = 0;
-  num_vector_fms = 0;
-  num_vector_sum = 0;
-  num_vector_max = 0;
-  num_vector_min = 0;
+  num_vector_fma  = 0;
+  num_vector_fms  = 0;
+  num_vector_sum  = 0;
+  num_vector_max  = 0;
+  num_vector_min  = 0;
   num_vector_vmax = 0;
   num_vector_vmin = 0;
-  num_vector_dot = 0;
+  num_vector_dot  = 0;
 
-  num_matrix_mm_mult_row_row = 0;
+  num_matrix_mm_mult_row_row    = 0;
   num_matrix_mm_multacc_row_row = 0;
-  num_matrix_mm_mult_col_col = 0;
+  num_matrix_mm_mult_col_col    = 0;
   num_matrix_mm_multacc_col_col = 0;
 }
 
-#define PRINT_STAT(STAT) if(STAT){printf("  %-32s   %ld\n", #STAT, STAT);}
+#define PRINT_STAT(STAT)                                                       \
+  if (STAT)                                                                    \
+  {                                                                            \
+    printf("  %-32s   %ld\n", #STAT, STAT);                                    \
+  }
 
-void RAJA::tensor_stats::printVectorStats(){
+void RAJA::tensor_stats::printVectorStats()
+{
 
   printf("RAJA SIMD Register Statistics:\n");
 
@@ -129,5 +135,4 @@ void RAJA::tensor_stats::printVectorStats(){
   PRINT_STAT(num_matrix_mm_multacc_row_row);
   PRINT_STAT(num_matrix_mm_mult_col_col);
   PRINT_STAT(num_matrix_mm_multacc_col_col);
-
 }
diff --git a/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp b/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp
index 8c8d051d8f..fe6cb470bf 100644
--- a/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp
+++ b/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp
@@ -12,51 +12,51 @@
 #include <iostream>
 
 template <typename INDEX_TYPE, typename WORKING_RES, typename POLICY_LIST>
-void DynamicForallResourceRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last, const int pol)
+void DynamicForallResourceRangeSegmentTestImpl(INDEX_TYPE first,
+                                               INDEX_TYPE last,
+                                               const int pol)
 {
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first), RAJA::stripIndexType(last));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first),
+                                         RAJA::stripIndexType(last));
   INDEX_TYPE N = INDEX_TYPE(r1.end() - r1.begin());
 
   WORKING_RES working_res;
-  camp::resources::Resource erased_working_res{working_res};
+  camp::resources::Resource erased_working_res {working_res};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     erased_working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, erased_working_res, &working_array,
+                                     &check_array, &test_array);
 
   const INDEX_TYPE rbegin = *r1.begin();
 
   std::iota(test_array, test_array + RAJA::stripIndexType(N), rbegin);
 
-  RAJA::expt::dynamic_forall<POLICY_LIST>(working_res, pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    working_array[RAJA::stripIndexType(idx - rbegin)] = idx;
-  });
+  RAJA::expt::dynamic_forall<POLICY_LIST>(
+      working_res, pol, r1,
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+      { working_array[RAJA::stripIndexType(idx - rbegin)] = idx; });
 
-  working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
+  working_res.memcpy(check_array, working_array,
+                     sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(erased_working_res,
-                                       working_array,
-                                       check_array,
-                                       test_array);
-
+  deallocateForallTestData<INDEX_TYPE>(erased_working_res, working_array,
+                                       check_array, test_array);
 }
 
 
 TYPED_TEST_SUITE_P(DynamicForallResourceRangeSegmentTest);
 template <typename T>
 class DynamicForallResourceRangeSegmentTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(DynamicForallResourceRangeSegmentTest, RangeSegmentForallResource)
 {
@@ -70,42 +70,45 @@ TYPED_TEST_P(DynamicForallResourceRangeSegmentTest, RangeSegmentForallResource)
   constexpr int N = camp::size<POLICY_LIST>::value;
 #endif
 
-  //If N == 2 host, no openmp is available
-  //If N == 3 host, openmp is available
-  //If N == 4 host, device is available
-  //If N == 5 host, openmp, device are on
+  // If N == 2 host, no openmp is available
+  // If N == 3 host, openmp is available
+  // If N == 4 host, device is available
+  // If N == 5 host, openmp, device are on
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
-  bool is_on_host = working_res.get_platform() == camp::resources::Platform::host ? true : false;
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
+  bool is_on_host =
+      working_res.get_platform() == camp::resources::Platform::host ? true
+                                                                    : false;
 
-  if(is_on_host) { 
+  if (is_on_host)
+  {
     int host_range = 2;
 #if defined(RAJA_ENABLE_OPENMP)
-    host_range = 3; 
-#endif      
-      //Loop through policy list
-      for(int pol=0; pol<host_range; ++pol) 
-        {
-          DynamicForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, POLICY_LIST>
-            (INDEX_TYPE(0), INDEX_TYPE(27), pol);
-        }
+    host_range = 3;
+#endif
+    // Loop through policy list
+    for (int pol = 0; pol < host_range; ++pol)
+    {
+      DynamicForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES,
+                                                POLICY_LIST>(
+          INDEX_TYPE(0), INDEX_TYPE(27), pol);
+    }
   }
 #if defined(RAJA_GPU_ACTIVE)
   else
   {
     int device_start = 2;
 #if defined(RAJA_ENABLE_OPENMP)
-    device_start = 3; 
-#endif      
-    for(int pol=device_start; pol<N; ++pol) 
+    device_start = 3;
+#endif
+    for (int pol = device_start; pol < N; ++pol)
     {
-    DynamicForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, POLICY_LIST>
-      (INDEX_TYPE(0), INDEX_TYPE(27), pol);
+      DynamicForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES,
+                                                POLICY_LIST>(
+          INDEX_TYPE(0), INDEX_TYPE(27), pol);
     }
   }
 #endif
-
-
 }
 
 REGISTER_TYPED_TEST_SUITE_P(DynamicForallResourceRangeSegmentTest,
diff --git a/test/functional/dynamic_forall/segment/tests/test-dynamic-forall-RangeSegment.hpp b/test/functional/dynamic_forall/segment/tests/test-dynamic-forall-RangeSegment.hpp
index 11168b0e30..09dec1c458 100644
--- a/test/functional/dynamic_forall/segment/tests/test-dynamic-forall-RangeSegment.hpp
+++ b/test/functional/dynamic_forall/segment/tests/test-dynamic-forall-RangeSegment.hpp
@@ -12,60 +12,66 @@
 #include <iostream>
 
 template <typename INDEX_TYPE, typename WORKING_RES, typename POLICY_LIST>
-void DynamicForallRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last, const int pol)
+void DynamicForallRangeSegmentTestImpl(INDEX_TYPE first,
+                                       INDEX_TYPE last,
+                                       const int pol)
 {
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first), RAJA::stripIndexType(last));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first),
+                                         RAJA::stripIndexType(last));
   INDEX_TYPE N = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     const INDEX_TYPE rbegin = *r1.begin();
 
     std::iota(test_array, test_array + RAJA::stripIndexType(N), rbegin);
 
-    RAJA::expt::dynamic_forall<POLICY_LIST>(pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      working_array[RAJA::stripIndexType(idx - rbegin)] = idx;
-    });
-
-  } else { // zero-length segment 
+    RAJA::expt::dynamic_forall<POLICY_LIST>(
+        pol, r1,
+        [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+        { working_array[RAJA::stripIndexType(idx - rbegin)] = idx; });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::expt::dynamic_forall<POLICY_LIST>(pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      (void) idx;
-      working_array[0]++;
-    });
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
 
+    RAJA::expt::dynamic_forall<POLICY_LIST>(pol, r1,
+                                            [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                                            {
+                                              (void)idx;
+                                              working_array[0]++;
+                                            });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -73,8 +79,7 @@ void DynamicForallRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last, const
 TYPED_TEST_SUITE_P(DynamicForallRangeSegmentTest);
 template <typename T>
 class DynamicForallRangeSegmentTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(DynamicForallRangeSegmentTest, RangeSegmentForall)
 {
@@ -87,45 +92,45 @@ TYPED_TEST_P(DynamicForallRangeSegmentTest, RangeSegmentForall)
   constexpr int N = camp::size<POLICY_LIST>::value;
 #endif
 
-  //If N == 2 host, no openmp is available
-  //If N == 3 host, openmp is available
-  //If N == 4 host, device is available
-  //If N == 5 host, openmp, device are on
+  // If N == 2 host, no openmp is available
+  // If N == 3 host, openmp is available
+  // If N == 4 host, device is available
+  // If N == 5 host, openmp, device are on
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
-  bool is_on_host = working_res.get_platform() == camp::resources::Platform::host ? true : false;
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
+  bool is_on_host =
+      working_res.get_platform() == camp::resources::Platform::host ? true
+                                                                    : false;
 
-  if(is_on_host) { 
+  if (is_on_host)
+  {
     int host_range = 2;
 #if defined(RAJA_ENABLE_OPENMP)
-    host_range = 3; 
-#endif      
-      //Loop through policy list
-      for(int pol=0; pol<host_range; ++pol) 
-        {
-          DynamicForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, POLICY_LIST>
-            (INDEX_TYPE(0), INDEX_TYPE(27), pol);
-        }
+    host_range = 3;
+#endif
+    // Loop through policy list
+    for (int pol = 0; pol < host_range; ++pol)
+    {
+      DynamicForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, POLICY_LIST>(
+          INDEX_TYPE(0), INDEX_TYPE(27), pol);
+    }
   }
 #if defined(RAJA_GPU_ACTIVE)
   else
   {
     int device_start = 2;
 #if defined(RAJA_ENABLE_OPENMP)
-    device_start = 3; 
-#endif      
-    for(int pol=device_start; pol<N; ++pol) 
+    device_start = 3;
+#endif
+    for (int pol = device_start; pol < N; ++pol)
     {
-    DynamicForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, POLICY_LIST>
-      (INDEX_TYPE(0), INDEX_TYPE(27), pol);
+      DynamicForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, POLICY_LIST>(
+          INDEX_TYPE(0), INDEX_TYPE(27), pol);
     }
   }
 #endif
-
-
 }
 
-REGISTER_TYPED_TEST_SUITE_P(DynamicForallRangeSegmentTest,
-                            RangeSegmentForall);
+REGISTER_TYPED_TEST_SUITE_P(DynamicForallRangeSegmentTest, RangeSegmentForall);
 
 #endif  // __TEST_BASIC_SHARED_HPP__
diff --git a/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-1D.hpp b/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-1D.hpp
index 1b9dd4334a..1189dfc36a 100644
--- a/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-1D.hpp
+++ b/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-1D.hpp
@@ -16,56 +16,60 @@
 template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void ForallCombiningAdapter1DTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 {
-  RAJA::TypedRangeSegment<INDEX_TYPE> r0(RAJA::stripIndexType(first), RAJA::stripIndexType(last));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r0(RAJA::stripIndexType(first),
+                                         RAJA::stripIndexType(last));
   INDEX_TYPE N0 = static_cast<INDEX_TYPE>(r0.end() - r0.begin());
-  INDEX_TYPE N = N0;
+  INDEX_TYPE N  = N0;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  size_t data_len = RAJA::stripIndexType(N)+1;
+  size_t data_len = RAJA::stripIndexType(N) + 1;
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
   {
 
     std::iota(test_array, test_array + RAJA::stripIndexType(N), first - first);
-    for (INDEX_TYPE i0 = INDEX_TYPE(0); i0 < N0; i0++) {
+    for (INDEX_TYPE i0 = INDEX_TYPE(0); i0 < N0; i0++)
+    {
       test_array[i0] = i0;
     }
     test_array[RAJA::stripIndexType(N)] = INDEX_TYPE(0);
 
     working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len);
 
-    auto adapter = RAJA::make_CombiningAdapter([=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      if (idx >= first && idx < last) {
-        // in bounds
-        working_array[RAJA::stripIndexType(idx - first)] += (idx - first);
-      } else {
-        // out of bounds
-        working_array[RAJA::stripIndexType(N)]++;
-      }
-    }, r0);
+    auto adapter = RAJA::make_CombiningAdapter(
+        [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+        {
+          if (idx >= first && idx < last)
+          {
+            // in bounds
+            working_array[RAJA::stripIndexType(idx - first)] += (idx - first);
+          }
+          else
+          {
+            // out of bounds
+            working_array[RAJA::stripIndexType(N)]++;
+          }
+        },
+        r0);
 
     RAJA::forall<EXEC_POLICY>(adapter.getRange(), adapter);
-
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i <= N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i <= N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -73,24 +77,31 @@ void ForallCombiningAdapter1DTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 TYPED_TEST_SUITE_P(ForallCombiningAdapter1DTest);
 template <typename T>
 class ForallCombiningAdapter1DTest : public ::testing::Test
-{
-};
+{};
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
-{
-}
+{}
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
 {
   // test zero-length range segment
-  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(-5));
+  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(-5));
 
-  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0));
-  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5));
+  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(0));
+  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(5));
 }
 
 
@@ -101,16 +112,19 @@ TYPED_TEST_P(ForallCombiningAdapter1DTest, Forall1D)
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // test zero-length range segment
-  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(3));
+  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(3));
 
-  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(27));
-  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(2047));
-  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(32000));
+  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(27));
+  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(2047));
+  ForallCombiningAdapter1DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(32000));
 
   runNegativeTests<INDEX_TYPE, WORKING_RES, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallCombiningAdapter1DTest,
-                            Forall1D);
+REGISTER_TYPED_TEST_SUITE_P(ForallCombiningAdapter1DTest, Forall1D);
 
 #endif  // __TEST_FORALL_CombiningAdapter_1D_HPP__
diff --git a/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-2D.hpp b/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-2D.hpp
index 2be6464bb8..5b011f6c8b 100644
--- a/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-2D.hpp
+++ b/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-2D.hpp
@@ -14,32 +14,35 @@
 #include "RAJA/util/CombiningAdapter.hpp"
 
 template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
-void ForallCombiningAdapter2DTestImpl(INDEX_TYPE first0, INDEX_TYPE last0,
-                                  INDEX_TYPE first1, INDEX_TYPE last1)
+void ForallCombiningAdapter2DTestImpl(INDEX_TYPE first0,
+                                      INDEX_TYPE last0,
+                                      INDEX_TYPE first1,
+                                      INDEX_TYPE last1)
 {
-  RAJA::TypedRangeSegment<INDEX_TYPE> r0(RAJA::stripIndexType(first0), RAJA::stripIndexType(last0));
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first1), RAJA::stripIndexType(last1));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r0(RAJA::stripIndexType(first0),
+                                         RAJA::stripIndexType(last0));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first1),
+                                         RAJA::stripIndexType(last1));
   INDEX_TYPE N0 = static_cast<INDEX_TYPE>(r0.end() - r0.begin());
   INDEX_TYPE N1 = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
-  INDEX_TYPE N = N0 * N1;
+  INDEX_TYPE N  = N0 * N1;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  size_t data_len = RAJA::stripIndexType(N)+1;
+  size_t data_len = RAJA::stripIndexType(N) + 1;
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
   {
 
-    for (INDEX_TYPE i0 = INDEX_TYPE(0); i0 < N0; i0++) {
-      for (INDEX_TYPE i1 = INDEX_TYPE(0); i1 < N1; i1++) {
+    for (INDEX_TYPE i0 = INDEX_TYPE(0); i0 < N0; i0++)
+    {
+      for (INDEX_TYPE i1 = INDEX_TYPE(0); i1 < N1; i1++)
+      {
         test_array[i0 * N1 + i1] = i0 * N1 + i1;
       }
     }
@@ -47,32 +50,36 @@ void ForallCombiningAdapter2DTestImpl(INDEX_TYPE first0, INDEX_TYPE last0,
 
     working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len);
 
-    auto adapter = RAJA::make_CombiningAdapter([=] RAJA_HOST_DEVICE(INDEX_TYPE idx0, INDEX_TYPE idx1) {
-      if (idx0 >= first0 && idx0 < last0 &&
-          idx1 >= first1 && idx1 < last1) {
-        // in bounds
-        working_array[RAJA::stripIndexType((idx0 - first0) * N1 +
-                                           (idx1 - first1))] += (idx0 - first0) * N1 +
-                                                                (idx1 - first1);
-      } else {
-        // out of bounds
-        working_array[RAJA::stripIndexType(N)]++;
-      }
-    }, r0, r1);
+    auto adapter = RAJA::make_CombiningAdapter(
+        [=] RAJA_HOST_DEVICE(INDEX_TYPE idx0, INDEX_TYPE idx1)
+        {
+          if (idx0 >= first0 && idx0 < last0 && idx1 >= first1 && idx1 < last1)
+          {
+            // in bounds
+            working_array[RAJA::stripIndexType((idx0 - first0) * N1 +
+                                               (idx1 - first1))] +=
+                (idx0 - first0) * N1 + (idx1 - first1);
+          }
+          else
+          {
+            // out of bounds
+            working_array[RAJA::stripIndexType(N)]++;
+          }
+        },
+        r0, r1);
 
     RAJA::forall<EXEC_POLICY>(adapter.getRange(), adapter);
-
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i <= N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i <= N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -80,27 +87,31 @@ void ForallCombiningAdapter2DTestImpl(INDEX_TYPE first0, INDEX_TYPE last0,
 TYPED_TEST_SUITE_P(ForallCombiningAdapter2DTest);
 template <typename T>
 class ForallCombiningAdapter2DTest : public ::testing::Test
-{
-};
+{};
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
-{
-}
+{}
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
 {
   // test zero-length range segment
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(-5),
-                                                                     INDEX_TYPE(-3), INDEX_TYPE(-3));
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(-5), INDEX_TYPE(-3), INDEX_TYPE(-3));
 
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0),
-                                                                     INDEX_TYPE(-3), INDEX_TYPE(0));
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5),
-                                                                     INDEX_TYPE(-3), INDEX_TYPE(2));
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(0), INDEX_TYPE(-3), INDEX_TYPE(0));
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(5), INDEX_TYPE(-3), INDEX_TYPE(2));
 }
 
 
@@ -111,24 +122,23 @@ TYPED_TEST_P(ForallCombiningAdapter2DTest, Forall2D)
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // test zero-length range segment
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(3),
-                                                                     INDEX_TYPE(5), INDEX_TYPE(5));
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(3),
-                                                                     INDEX_TYPE(5), INDEX_TYPE(8));
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(6),
-                                                                     INDEX_TYPE(5), INDEX_TYPE(5));
-
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(15),
-                                                                     INDEX_TYPE(0), INDEX_TYPE(17));
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(57),
-                                                                     INDEX_TYPE(4), INDEX_TYPE(21));
-  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(13), INDEX_TYPE(156),
-                                                                     INDEX_TYPE(17), INDEX_TYPE(203));
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(3), INDEX_TYPE(5), INDEX_TYPE(5));
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(3), INDEX_TYPE(5), INDEX_TYPE(8));
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(6), INDEX_TYPE(5), INDEX_TYPE(5));
+
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(15), INDEX_TYPE(0), INDEX_TYPE(17));
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(57), INDEX_TYPE(4), INDEX_TYPE(21));
+  ForallCombiningAdapter2DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(13), INDEX_TYPE(156), INDEX_TYPE(17), INDEX_TYPE(203));
 
   runNegativeTests<INDEX_TYPE, WORKING_RES, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallCombiningAdapter2DTest,
-                            Forall2D);
+REGISTER_TYPED_TEST_SUITE_P(ForallCombiningAdapter2DTest, Forall2D);
 
 #endif  // __TEST_FORALL_CombiningAdapter_2D_HPP__
diff --git a/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-3D.hpp b/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-3D.hpp
index 83213cc113..1b5611ee74 100644
--- a/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-3D.hpp
+++ b/test/functional/forall/CombiningAdapter/tests/test-forall-CombiningAdapter-3D.hpp
@@ -14,41 +14,43 @@
 #include "RAJA/util/CombiningAdapter.hpp"
 
 template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
-void ForallCombiningAdapter3DTestImpl(INDEX_TYPE first0, INDEX_TYPE last0,
-                                  INDEX_TYPE first1, INDEX_TYPE last1,
-                                  INDEX_TYPE first2, INDEX_TYPE last2)
+void ForallCombiningAdapter3DTestImpl(INDEX_TYPE first0,
+                                      INDEX_TYPE last0,
+                                      INDEX_TYPE first1,
+                                      INDEX_TYPE last1,
+                                      INDEX_TYPE first2,
+                                      INDEX_TYPE last2)
 {
-  RAJA::TypedRangeSegment<INDEX_TYPE> r0(RAJA::stripIndexType(first0), RAJA::stripIndexType(last0));
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first1), RAJA::stripIndexType(last1));
-  RAJA::TypedRangeSegment<INDEX_TYPE> r2(RAJA::stripIndexType(first2), RAJA::stripIndexType(last2));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r0(RAJA::stripIndexType(first0),
+                                         RAJA::stripIndexType(last0));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first1),
+                                         RAJA::stripIndexType(last1));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r2(RAJA::stripIndexType(first2),
+                                         RAJA::stripIndexType(last2));
   INDEX_TYPE N0 = static_cast<INDEX_TYPE>(r0.end() - r0.begin());
   INDEX_TYPE N1 = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
   INDEX_TYPE N2 = static_cast<INDEX_TYPE>(r2.end() - r2.begin());
-  INDEX_TYPE N = N0 * N1 * N2;
+  INDEX_TYPE N  = N0 * N1 * N2;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  size_t data_len = RAJA::stripIndexType(N)+1;
+  size_t data_len = RAJA::stripIndexType(N) + 1;
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
   {
 
-    for (INDEX_TYPE i0 = INDEX_TYPE(0); i0 < N0; i0++) {
-      for (INDEX_TYPE i1 = INDEX_TYPE(0); i1 < N1; i1++) {
-        for (INDEX_TYPE i2 = INDEX_TYPE(0); i2 < N2; i2++) {
-          test_array[i0 * N1*N2 +
-                     i1 * N2 +
-                     i2] = i0 * N1 * N2 +
-                           i1 * N2 +
-                           i2;
+    for (INDEX_TYPE i0 = INDEX_TYPE(0); i0 < N0; i0++)
+    {
+      for (INDEX_TYPE i1 = INDEX_TYPE(0); i1 < N1; i1++)
+      {
+        for (INDEX_TYPE i2 = INDEX_TYPE(0); i2 < N2; i2++)
+        {
+          test_array[i0 * N1 * N2 + i1 * N2 + i2] = i0 * N1 * N2 + i1 * N2 + i2;
         }
       }
     }
@@ -56,35 +58,38 @@ void ForallCombiningAdapter3DTestImpl(INDEX_TYPE first0, INDEX_TYPE last0,
 
     working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * data_len);
 
-    auto adapter = RAJA::make_CombiningAdapter([=] RAJA_HOST_DEVICE(INDEX_TYPE idx0, INDEX_TYPE idx1, INDEX_TYPE idx2) {
-      if (idx0 >= first0 && idx0 < last0 &&
-          idx1 >= first1 && idx1 < last1 &&
-          idx2 >= first2 && idx2 < last2) {
-        // in bounds
-        working_array[RAJA::stripIndexType((idx0 - first0) * N1 * N2 +
-                                           (idx1 - first1) * N2 +
-                                           (idx2 - first2))] += (idx0 - first0) * N1 * N2 +
-                                                                (idx1 - first1) * N2 +
-                                                                (idx2 - first2);
-      } else {
-        // out of bounds
-        working_array[RAJA::stripIndexType(N)]++;
-      }
-    }, r0, r1, r2);
+    auto adapter = RAJA::make_CombiningAdapter(
+        [=] RAJA_HOST_DEVICE(INDEX_TYPE idx0, INDEX_TYPE idx1, INDEX_TYPE idx2)
+        {
+          if (idx0 >= first0 && idx0 < last0 && idx1 >= first1 &&
+              idx1 < last1 && idx2 >= first2 && idx2 < last2)
+          {
+            // in bounds
+            working_array[RAJA::stripIndexType(
+                (idx0 - first0) * N1 * N2 + (idx1 - first1) * N2 +
+                (idx2 - first2))] += (idx0 - first0) * N1 * N2 +
+                                     (idx1 - first1) * N2 + (idx2 - first2);
+          }
+          else
+          {
+            // out of bounds
+            working_array[RAJA::stripIndexType(N)]++;
+          }
+        },
+        r0, r1, r2);
 
     RAJA::forall<EXEC_POLICY>(adapter.getRange(), adapter);
-
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i <= N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i <= N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -92,30 +97,34 @@ void ForallCombiningAdapter3DTestImpl(INDEX_TYPE first0, INDEX_TYPE last0,
 TYPED_TEST_SUITE_P(ForallCombiningAdapter3DTest);
 template <typename T>
 class ForallCombiningAdapter3DTest : public ::testing::Test
-{
-};
+{};
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
-{
-}
+{}
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
 {
   // test zero-length range segment
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(-5),
-                                                                     INDEX_TYPE(-3), INDEX_TYPE(-3),
-                                                                     INDEX_TYPE(-1), INDEX_TYPE(-1));
-
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0),
-                                                                     INDEX_TYPE(-3), INDEX_TYPE(0),
-                                                                     INDEX_TYPE(-4), INDEX_TYPE(0));
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5),
-                                                                     INDEX_TYPE(-3), INDEX_TYPE(2),
-                                                                     INDEX_TYPE(-7), INDEX_TYPE(-2));
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(-5), INDEX_TYPE(-3), INDEX_TYPE(-3),
+      INDEX_TYPE(-1), INDEX_TYPE(-1));
+
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(0), INDEX_TYPE(-3), INDEX_TYPE(0),
+      INDEX_TYPE(-4), INDEX_TYPE(0));
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(5), INDEX_TYPE(-3), INDEX_TYPE(2),
+      INDEX_TYPE(-7), INDEX_TYPE(-2));
 }
 
 
@@ -126,33 +135,32 @@ TYPED_TEST_P(ForallCombiningAdapter3DTest, Forall3D)
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // test zero-length range segment
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(3),
-                                                                     INDEX_TYPE(5), INDEX_TYPE(5),
-                                                                     INDEX_TYPE(7), INDEX_TYPE(7));
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(3),
-                                                                     INDEX_TYPE(5), INDEX_TYPE(6),
-                                                                     INDEX_TYPE(7), INDEX_TYPE(8));
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(4),
-                                                                     INDEX_TYPE(5), INDEX_TYPE(5),
-                                                                     INDEX_TYPE(7), INDEX_TYPE(8));
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(4),
-                                                                     INDEX_TYPE(5), INDEX_TYPE(6),
-                                                                     INDEX_TYPE(7), INDEX_TYPE(7));
-
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(7),
-                                                                     INDEX_TYPE(0), INDEX_TYPE(6),
-                                                                     INDEX_TYPE(0), INDEX_TYPE(3));
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(13),
-                                                                     INDEX_TYPE(4), INDEX_TYPE(17),
-                                                                     INDEX_TYPE(6), INDEX_TYPE(11));
-  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(13), INDEX_TYPE(46),
-                                                                     INDEX_TYPE(17), INDEX_TYPE(51),
-                                                                     INDEX_TYPE(4), INDEX_TYPE(31));
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(3), INDEX_TYPE(5), INDEX_TYPE(5), INDEX_TYPE(7),
+      INDEX_TYPE(7));
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(3), INDEX_TYPE(5), INDEX_TYPE(6), INDEX_TYPE(7),
+      INDEX_TYPE(8));
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(4), INDEX_TYPE(5), INDEX_TYPE(5), INDEX_TYPE(7),
+      INDEX_TYPE(8));
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(4), INDEX_TYPE(5), INDEX_TYPE(6), INDEX_TYPE(7),
+      INDEX_TYPE(7));
+
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(7), INDEX_TYPE(0), INDEX_TYPE(6), INDEX_TYPE(0),
+      INDEX_TYPE(3));
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(13), INDEX_TYPE(4), INDEX_TYPE(17),
+      INDEX_TYPE(6), INDEX_TYPE(11));
+  ForallCombiningAdapter3DTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(13), INDEX_TYPE(46), INDEX_TYPE(17), INDEX_TYPE(51),
+      INDEX_TYPE(4), INDEX_TYPE(31));
 
   runNegativeTests<INDEX_TYPE, WORKING_RES, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallCombiningAdapter3DTest,
-                            Forall3D);
+REGISTER_TYPED_TEST_SUITE_P(ForallCombiningAdapter3DTest, Forall3D);
 
 #endif  // __TEST_FORALL_CombiningAdapter_3D_HPP__
diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp
index a9e2c5a9f8..f3cfc532a0 100644
--- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp
+++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Header file containing basic functional tests for atomic operations with forall.
+/// Header file containing basic functional tests for atomic operations with
+/// forall.
 ///
 
 #ifndef __TEST_FORALL_ATOMIC_BASIC_HPP__
@@ -15,38 +16,40 @@
 #include <numeric>
 
 // segment multiplexer
-template< typename IdxType, typename SegType >
-struct RSMultiplexer {};
+template <typename IdxType, typename SegType>
+struct RSMultiplexer
+{};
 
-template< typename IdxType >
-struct RSMultiplexer < IdxType, RAJA::TypedRangeSegment<IdxType> >
+template <typename IdxType>
+struct RSMultiplexer<IdxType, RAJA::TypedRangeSegment<IdxType>>
 {
   RAJA::TypedRangeSegment<IdxType>
-  makeseg( IdxType N, camp::resources::Resource RAJA_UNUSED_ARG(work_res) )
+  makeseg(IdxType N, camp::resources::Resource RAJA_UNUSED_ARG(work_res))
   {
-    return RAJA::TypedRangeSegment<IdxType>( 0, N );
+    return RAJA::TypedRangeSegment<IdxType>(0, N);
   }
 };
 
-template< typename IdxType >
-struct RSMultiplexer < IdxType, RAJA::TypedRangeStrideSegment<IdxType> >
+template <typename IdxType>
+struct RSMultiplexer<IdxType, RAJA::TypedRangeStrideSegment<IdxType>>
 {
   RAJA::TypedRangeStrideSegment<IdxType>
-  makeseg( IdxType N, camp::resources::Resource RAJA_UNUSED_ARG(work_res) )
+  makeseg(IdxType N, camp::resources::Resource RAJA_UNUSED_ARG(work_res))
   {
-    return RAJA::TypedRangeStrideSegment<IdxType>( 0, N, 1 );
+    return RAJA::TypedRangeStrideSegment<IdxType>(0, N, 1);
   }
 };
 
-template< typename IdxType >
-struct RSMultiplexer < IdxType, RAJA::TypedListSegment<IdxType> >
+template <typename IdxType>
+struct RSMultiplexer<IdxType, RAJA::TypedListSegment<IdxType>>
 {
-  RAJA::TypedListSegment<IdxType>
-  makeseg( IdxType N, camp::resources::Resource work_res )
+  RAJA::TypedListSegment<IdxType> makeseg(IdxType N,
+                                          camp::resources::Resource work_res)
   {
     std::vector<IdxType> temp(N);
-    std::iota( std::begin(temp), std::end(temp), 0 );
-    return RAJA::TypedListSegment<IdxType>( &temp[0], static_cast<size_t>(temp.size()), work_res );
+    std::iota(std::begin(temp), std::end(temp), 0);
+    return RAJA::TypedListSegment<IdxType>(
+        &temp[0], static_cast<size_t>(temp.size()), work_res);
   }
 };
 // end segment multiplexer
@@ -58,58 +61,59 @@ template <typename ExecPolicy,
           typename IdxType,
           typename SegmentType,
           typename T>
-void ForallAtomicBasicTestImpl( IdxType seglimit )
+void ForallAtomicBasicTestImpl(IdxType seglimit)
 {
   // initialize an array
   const int len = 12;
 
-  camp::resources::Resource work_res{WORKINGRES()};
+  camp::resources::Resource work_res {WORKINGRES()};
 
-  SegmentType seg = 
-    RSMultiplexer<IdxType, SegmentType>().makeseg(seglimit, work_res);
+  SegmentType seg =
+      RSMultiplexer<IdxType, SegmentType>().makeseg(seglimit, work_res);
 
-  T * work_array;
-  T * test_array;
-  T * check_array;
+  T* work_array;
+  T* test_array;
+  T* check_array;
 
-  allocateForallTestData<T>(  len,
-                              work_res,
-                              &work_array,
-                              &check_array,
-                              &test_array );
+  allocateForallTestData<T>(len, work_res, &work_array, &check_array,
+                            &test_array);
 
   // use atomic add to reduce the array
-  test_array[0] = static_cast<T>(0);
-  test_array[1] = static_cast<T>(seglimit);
-  test_array[2] = static_cast<T>(seglimit);
-  test_array[3] = static_cast<T>(0);
-  test_array[4] = static_cast<T>(0);
-  test_array[5] = static_cast<T>(seglimit + 1);
-  test_array[6] = static_cast<T>(seglimit);
-  test_array[7] = static_cast<T>(0);
-  test_array[8] = static_cast<T>(0);
-  test_array[9] = static_cast<T>(0);
+  test_array[0]  = static_cast<T>(0);
+  test_array[1]  = static_cast<T>(seglimit);
+  test_array[2]  = static_cast<T>(seglimit);
+  test_array[3]  = static_cast<T>(0);
+  test_array[4]  = static_cast<T>(0);
+  test_array[5]  = static_cast<T>(seglimit + 1);
+  test_array[6]  = static_cast<T>(seglimit);
+  test_array[7]  = static_cast<T>(0);
+  test_array[8]  = static_cast<T>(0);
+  test_array[9]  = static_cast<T>(0);
   test_array[10] = static_cast<T>(0);
   test_array[11] = static_cast<T>(0);
 
   work_res.memcpy(work_array, test_array, sizeof(T) * len);
 
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-    RAJA::atomicAdd<AtomicPolicy>(work_array + 0, static_cast<T>(1));
-    RAJA::atomicSub<AtomicPolicy>(work_array + 1, static_cast<T>(1));
-    RAJA::atomicMin<AtomicPolicy>(work_array + 2, static_cast<T>(i));
-    RAJA::atomicMax<AtomicPolicy>(work_array + 3, static_cast<T>(i));
-    RAJA::atomicInc<AtomicPolicy>(work_array + 4);
-    RAJA::atomicDec<AtomicPolicy>(work_array + 5);
-    RAJA::atomicExchange<AtomicPolicy>(work_array + 6, static_cast<T>(i));
-    RAJA::atomicCAS<AtomicPolicy>(work_array + 7, static_cast<T>(i), static_cast<T>(i+1));
-    RAJA::atomicLoad<AtomicPolicy>(work_array + 8);
-    RAJA::atomicStore<AtomicPolicy>(work_array + 9, static_cast<T>(1));
-    RAJA::atomicInc<AtomicPolicy>(work_array + 10, static_cast<T>(16));
-    RAJA::atomicDec<AtomicPolicy>(work_array + 11, static_cast<T>(16));
-  });
-
-  work_res.memcpy( check_array, work_array, sizeof(T) * len );
+  RAJA::forall<ExecPolicy>(
+      seg,
+      [=] RAJA_HOST_DEVICE(IdxType i)
+      {
+        RAJA::atomicAdd<AtomicPolicy>(work_array + 0, static_cast<T>(1));
+        RAJA::atomicSub<AtomicPolicy>(work_array + 1, static_cast<T>(1));
+        RAJA::atomicMin<AtomicPolicy>(work_array + 2, static_cast<T>(i));
+        RAJA::atomicMax<AtomicPolicy>(work_array + 3, static_cast<T>(i));
+        RAJA::atomicInc<AtomicPolicy>(work_array + 4);
+        RAJA::atomicDec<AtomicPolicy>(work_array + 5);
+        RAJA::atomicExchange<AtomicPolicy>(work_array + 6, static_cast<T>(i));
+        RAJA::atomicCAS<AtomicPolicy>(work_array + 7, static_cast<T>(i),
+                                      static_cast<T>(i + 1));
+        RAJA::atomicLoad<AtomicPolicy>(work_array + 8);
+        RAJA::atomicStore<AtomicPolicy>(work_array + 9, static_cast<T>(1));
+        RAJA::atomicInc<AtomicPolicy>(work_array + 10, static_cast<T>(16));
+        RAJA::atomicDec<AtomicPolicy>(work_array + 11, static_cast<T>(16));
+      });
+
+  work_res.memcpy(check_array, work_array, sizeof(T) * len);
   work_res.wait();
 
   EXPECT_EQ(static_cast<T>(seglimit), check_array[0]);
@@ -127,17 +131,13 @@ void ForallAtomicBasicTestImpl( IdxType seglimit )
   EXPECT_EQ(static_cast<T>(4), check_array[10]);
   EXPECT_EQ(static_cast<T>(13), check_array[11]);
 
-  deallocateForallTestData<T>(work_res,
-                              work_array,
-                              check_array,
-                              test_array);
+  deallocateForallTestData<T>(work_res, work_array, check_array, test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallAtomicBasicTest);
 template <typename T>
 class ForallAtomicBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicBasicTest, AtomicBasicForall)
 {
@@ -147,18 +147,15 @@ TYPED_TEST_P(ForallAtomicBasicTest, AtomicBasicForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicBasicTestImpl<AExec, APol, ResType, 
-                            IdxType, RAJA::TypedRangeSegment<IdxType>, 
-                            DType>(10000);
-  ForallAtomicBasicTestImpl<AExec, APol, ResType, 
-                            IdxType, RAJA::TypedRangeStrideSegment<IdxType>, 
-                            DType>(10000);
-  ForallAtomicBasicTestImpl<AExec, APol, ResType, 
-                            IdxType, RAJA::TypedListSegment<IdxType>, 
-                            DType>(10000);
+  ForallAtomicBasicTestImpl<AExec, APol, ResType, IdxType,
+                            RAJA::TypedRangeSegment<IdxType>, DType>(10000);
+  ForallAtomicBasicTestImpl<AExec, APol, ResType, IdxType,
+                            RAJA::TypedRangeStrideSegment<IdxType>, DType>(
+      10000);
+  ForallAtomicBasicTestImpl<AExec, APol, ResType, IdxType,
+                            RAJA::TypedListSegment<IdxType>, DType>(10000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallAtomicBasicTest,
-                            AtomicBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicBasicTest, AtomicBasicForall);
 
 #endif  //__TEST_FORALL_ATOMIC_BASIC_HPP__
diff --git a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefAdd.hpp b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefAdd.hpp
index 9089844744..04eff1251e 100644
--- a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefAdd.hpp
+++ b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefAdd.hpp
@@ -6,98 +6,135 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing basic functional tests for addition arithmetic atomic operations using forall
+/// Source file containing basic functional tests for addition arithmetic atomic
+/// operations using forall
 ///
 
 #ifndef __TEST_FORALL_ATOMICREF_ADD_HPP__
 #define __TEST_FORALL_ATOMICREF_ADD_HPP__
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct PreIncCountOp {
-  PreIncCountOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : counter(dcount), min((T)0), max((T)seg.size()-(T)1), final((T)seg.size())
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct PreIncCountOp
+{
+  PreIncCountOp(T* dcount,
+                T* hcount,
+                camp::resources::Resource work_res,
+                RAJA::TypedRangeSegment<IdxType> seg)
+      : counter(dcount),
+        min((T)0),
+        max((T)seg.size() - (T)1),
+        final((T)seg.size())
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const {
-      return (++counter) - (T)1;
-    }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const { return (++counter) - (T)1; }
   RAJA::AtomicRef<T, AtomicPolicy> counter;
   T min, max, final;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct PostIncCountOp {
-  PostIncCountOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : counter(dcount), min((T)0), max((T)seg.size()-(T)1), final((T)seg.size())
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct PostIncCountOp
+{
+  PostIncCountOp(T* dcount,
+                 T* hcount,
+                 camp::resources::Resource work_res,
+                 RAJA::TypedRangeSegment<IdxType> seg)
+      : counter(dcount),
+        min((T)0),
+        max((T)seg.size() - (T)1),
+        final((T)seg.size())
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const {
-      return (counter++);
-    }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const { return (counter++); }
   RAJA::AtomicRef<T, AtomicPolicy> counter;
   T min, max, final;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct AddEqCountOp {
-  AddEqCountOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : counter(dcount), min((T)0), max((T)seg.size()-(T)1), final((T)seg.size())
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct AddEqCountOp
+{
+  AddEqCountOp(T* dcount,
+               T* hcount,
+               camp::resources::Resource work_res,
+               RAJA::TypedRangeSegment<IdxType> seg)
+      : counter(dcount),
+        min((T)0),
+        max((T)seg.size() - (T)1),
+        final((T)seg.size())
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const {
-      return (counter += (T)1) - (T)1;
-    }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const
+  {
+    return (counter += (T)1) - (T)1;
+  }
   RAJA::AtomicRef<T, AtomicPolicy> counter;
   T min, max, final;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct FetchAddCountOp {
-  FetchAddCountOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : counter(dcount), min((T)0), max((T)seg.size()-(T)1), final((T)seg.size())
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct FetchAddCountOp
+{
+  FetchAddCountOp(T* dcount,
+                  T* hcount,
+                  camp::resources::Resource work_res,
+                  RAJA::TypedRangeSegment<IdxType> seg)
+      : counter(dcount),
+        min((T)0),
+        max((T)seg.size() - (T)1),
+        final((T)seg.size())
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const {
-      return counter.fetch_add((T)1);
-    }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const
+  {
+    return counter.fetch_add((T)1);
+  }
   RAJA::AtomicRef<T, AtomicPolicy> counter;
   T min, max, final;
 };
 
 template <typename ExecPolicy,
-         typename AtomicPolicy,
-         typename IdxType,
-         typename T,
-         template <typename, typename, typename> class CountOp>
+          typename AtomicPolicy,
+          typename IdxType,
+          typename T,
+          template <typename, typename, typename>
+          class CountOp>
 void testAtomicRefAdd(RAJA::TypedRangeSegment<IdxType> seg,
-    T* count, T* list, bool* hit,
-    T* hcount, T* hlist, bool* hhit,
-    camp::resources::Resource work_res, IdxType N)
+                      T* count,
+                      T* list,
+                      bool* hit,
+                      T* hcount,
+                      T* hlist,
+                      bool* hhit,
+                      camp::resources::Resource work_res,
+                      IdxType N)
 {
   CountOp<T, AtomicPolicy, IdxType> countop(count, hcount, work_res, seg);
 
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      list[i] = countop.max + (T)1;
-      hit[i] = false;
-      });
-
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      T val = countop(i);
-      list[i] = val;
-      hit[(IdxType)val] = true;
-      });
+  RAJA::forall<ExecPolicy>(seg,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             list[i] = countop.max + (T)1;
+                             hit[i]  = false;
+                           });
+
+  RAJA::forall<ExecPolicy>(seg,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             T val             = countop(i);
+                             list[i]           = val;
+                             hit[(IdxType)val] = true;
+                           });
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -106,9 +143,9 @@ void testAtomicRefAdd(RAJA::TypedRangeSegment<IdxType> seg,
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  work_res.memcpy( hcount, count, sizeof(T) );
-  work_res.memcpy( hlist, list, sizeof(T) * N );
-  work_res.memcpy( hhit, hit, sizeof(bool) * N );
+  work_res.memcpy(hcount, count, sizeof(T));
+  work_res.memcpy(hlist, list, sizeof(T) * N);
+  work_res.memcpy(hhit, hit, sizeof(bool) * N);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -119,7 +156,8 @@ void testAtomicRefAdd(RAJA::TypedRangeSegment<IdxType> seg,
 #endif
 
   EXPECT_EQ(countop.final, hcount[0]);
-  for (IdxType i = 0; i < seg.size(); i++) {
+  for (IdxType i = 0; i < seg.size(); i++)
+  {
     EXPECT_LE(countop.min, hlist[i]);
     EXPECT_GE(countop.max, hlist[i]);
     EXPECT_TRUE(hhit[i]);
@@ -132,21 +170,21 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicRefAddTestImpl( IdxType N )
+void ForallAtomicRefAddTestImpl(IdxType N)
 {
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
 
-  camp::resources::Resource work_res{WORKINGRES()};
+  camp::resources::Resource work_res {WORKINGRES()};
 
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T * count   = work_res.allocate<T>(1);
-  T * list    = work_res.allocate<T>(N);
-  bool * hit  = work_res.allocate<bool>(N);
+  T* count  = work_res.allocate<T>(1);
+  T* list   = work_res.allocate<T>(N);
+  bool* hit = work_res.allocate<bool>(N);
 
-  T * hcount   = host_res.allocate<T>(1);
-  T * hlist    = host_res.allocate<T>(N);
-  bool * hhit  = host_res.allocate<bool>(N);
+  T* hcount  = host_res.allocate<T>(1);
+  T* hlist   = host_res.allocate<T>(N);
+  bool* hhit = host_res.allocate<bool>(N);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -156,29 +194,28 @@ void ForallAtomicRefAddTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  testAtomicRefAdd<ExecPolicy, AtomicPolicy, IdxType, T, 
-                     PreIncCountOp  >(seg, count, list, hit, hcount, hlist, hhit, work_res, N);
-  testAtomicRefAdd<ExecPolicy, AtomicPolicy, IdxType, T, 
-                     PostIncCountOp >(seg, count, list, hit, hcount, hlist, hhit, work_res, N);
-  testAtomicRefAdd<ExecPolicy, AtomicPolicy, IdxType, T, 
-                     AddEqCountOp   >(seg, count, list, hit, hcount, hlist, hhit, work_res, N);
-  testAtomicRefAdd<ExecPolicy, AtomicPolicy, IdxType, T, 
-                     FetchAddCountOp>(seg, count, list, hit, hcount, hlist, hhit, work_res, N);
-
-  work_res.deallocate( count );
-  work_res.deallocate( list );
-  work_res.deallocate( hit );
-  host_res.deallocate( hcount );
-  host_res.deallocate( hlist );
-  host_res.deallocate( hhit ); 
+  testAtomicRefAdd<ExecPolicy, AtomicPolicy, IdxType, T, PreIncCountOp>(
+      seg, count, list, hit, hcount, hlist, hhit, work_res, N);
+  testAtomicRefAdd<ExecPolicy, AtomicPolicy, IdxType, T, PostIncCountOp>(
+      seg, count, list, hit, hcount, hlist, hhit, work_res, N);
+  testAtomicRefAdd<ExecPolicy, AtomicPolicy, IdxType, T, AddEqCountOp>(
+      seg, count, list, hit, hcount, hlist, hhit, work_res, N);
+  testAtomicRefAdd<ExecPolicy, AtomicPolicy, IdxType, T, FetchAddCountOp>(
+      seg, count, list, hit, hcount, hlist, hhit, work_res, N);
+
+  work_res.deallocate(count);
+  work_res.deallocate(list);
+  work_res.deallocate(hit);
+  host_res.deallocate(hcount);
+  host_res.deallocate(hlist);
+  host_res.deallocate(hhit);
 }
 
 
 TYPED_TEST_SUITE_P(ForallAtomicRefAddTest);
 template <typename T>
 class ForallAtomicRefAddTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicRefAddTest, AtomicRefAddForall)
 {
@@ -188,10 +225,9 @@ TYPED_TEST_P(ForallAtomicRefAddTest, AtomicRefAddForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicRefAddTestImpl<AExec, APol, ResType, IdxType, DType>( 10000 );
+  ForallAtomicRefAddTestImpl<AExec, APol, ResType, IdxType, DType>(10000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefAddTest,
-                            AtomicRefAddForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefAddTest, AtomicRefAddForall);
 
 #endif  //__TEST_FORALL_ATOMICREF_ADD_HPP__
diff --git a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefCAS.hpp b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefCAS.hpp
index 8f036fc4b9..97dccdfead 100644
--- a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefCAS.hpp
+++ b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefCAS.hpp
@@ -6,92 +6,124 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing basic functional tests for CAS atomic operations using forall
+/// Source file containing basic functional tests for CAS atomic operations
+/// using forall
 ///
 
 #ifndef __TEST_FORALL_ATOMICREF_CAS_HPP__
 #define __TEST_FORALL_ATOMICREF_CAS_HPP__
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct CASOtherOp : all_op {
-  CASOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min((T)0), max((T)seg.size() - (T)1),
-    final_min(min), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct CASOtherOp : all_op
+{
+  CASOtherOp(T* dcount,
+             T* hcount,
+             camp::resources::Resource work_res,
+             RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min((T)0),
+        max((T)seg.size() - (T)1),
+        final_min(min),
+        final_max(max)
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
+  T operator()(IdxType i) const
+  {
+    T received, expect = (T)0;
+    while ((received = other.CAS(expect, (T)i)) != expect)
     {
-      T received, expect = (T)0;
-      while ((received = other.CAS(expect, (T)i)) != expect) {
-        expect = received;
-      }
-      return received;
+      expect = received;
     }
+    return received;
+  }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct CompareExchangeWeakOtherOp : all_op {
-  CompareExchangeWeakOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min((T)0), max((T)seg.size() - (T)1),
-    final_min(min), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct CompareExchangeWeakOtherOp : all_op
+{
+  CompareExchangeWeakOtherOp(T* dcount,
+                             T* hcount,
+                             camp::resources::Resource work_res,
+                             RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min((T)0),
+        max((T)seg.size() - (T)1),
+        final_min(min),
+        final_max(max)
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
+  T operator()(IdxType i) const
+  {
+    T expect = (T)0;
+    while (!other.compare_exchange_weak(expect, (T)i))
     {
-      T expect = (T)0;
-      while (!other.compare_exchange_weak(expect, (T)i)) {}
-      return expect;
     }
+    return expect;
+  }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct CompareExchangeStrongOtherOp : all_op {
-  CompareExchangeStrongOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min((T)0), max((T)seg.size() - (T)1),
-    final_min(min), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct CompareExchangeStrongOtherOp : all_op
+{
+  CompareExchangeStrongOtherOp(T* dcount,
+                               T* hcount,
+                               camp::resources::Resource work_res,
+                               RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min((T)0),
+        max((T)seg.size() - (T)1),
+        final_min(min),
+        final_max(max)
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
+  T operator()(IdxType i) const
+  {
+    T expect = (T)0;
+    while (!other.compare_exchange_strong(expect, (T)i))
     {
-      T expect = (T)0;
-      while (!other.compare_exchange_strong(expect, (T)i)) {}
-      return expect;
     }
+    return expect;
+  }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template  < typename ExecPolicy,
-            typename AtomicPolicy,
-            typename IdxType,
-            typename T,
-            template <typename, typename, typename> class OtherOp>
-void
-testAtomicRefCASOp(RAJA::TypedRangeSegment<IdxType> seg, T* count, T* list,
-    T* hcount, T* hlist,
-    camp::resources::Resource work_res, IdxType N)
+template <typename ExecPolicy,
+          typename AtomicPolicy,
+          typename IdxType,
+          typename T,
+          template <typename, typename, typename>
+          class OtherOp>
+void testAtomicRefCASOp(RAJA::TypedRangeSegment<IdxType> seg,
+                        T* count,
+                        T* list,
+                        T* hcount,
+                        T* hlist,
+                        camp::resources::Resource work_res,
+                        IdxType N)
 {
   OtherOp<T, AtomicPolicy, IdxType> otherop(count, hcount, work_res, seg);
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      list[i] = otherop.max + (T)1;
-  });
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      T val = otherop(i);
-      list[i] = val;
-  });
+  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i)
+                           { list[i] = otherop.max + (T)1; });
+  RAJA::forall<ExecPolicy>(seg,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             T val   = otherop(i);
+                             list[i] = val;
+                           });
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
 #endif
@@ -99,12 +131,13 @@ testAtomicRefCASOp(RAJA::TypedRangeSegment<IdxType> seg, T* count, T* list,
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  work_res.memcpy( hcount, count, sizeof(T) );
-  work_res.memcpy( hlist, list, sizeof(T) * N );
+  work_res.memcpy(hcount, count, sizeof(T));
+  work_res.memcpy(hlist, list, sizeof(T) * N);
 
   EXPECT_LE(otherop.final_min, hcount[0]);
   EXPECT_GE(otherop.final_max, hcount[0]);
-  for (IdxType i = 0; i < seg.size(); i++) {
+  for (IdxType i = 0; i < seg.size(); i++)
+  {
     EXPECT_LE(otherop.min, hlist[i]);
     EXPECT_GE(otherop.max, hlist[i]);
   }
@@ -116,19 +149,19 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicRefCASTestImpl( IdxType N )
+void ForallAtomicRefCASTestImpl(IdxType N)
 {
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
 
-  camp::resources::Resource work_res{WORKINGRES()};
+  camp::resources::Resource work_res {WORKINGRES()};
 
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T * count   = work_res.allocate<T>(1);
-  T * list    = work_res.allocate<T>(N);
+  T* count = work_res.allocate<T>(1);
+  T* list  = work_res.allocate<T>(N);
 
-  T * hcount   = host_res.allocate<T>(1);
-  T * hlist    = host_res.allocate<T>(N);
+  T* hcount = host_res.allocate<T>(1);
+  T* hlist  = host_res.allocate<T>(N);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -138,25 +171,26 @@ void ForallAtomicRefCASTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  testAtomicRefCASOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       CASOtherOp                  >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefCASOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       CompareExchangeWeakOtherOp  >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefCASOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       CompareExchangeStrongOtherOp>(seg, count, list, hcount, hlist, work_res, N);
-
-  work_res.deallocate( count );
-  work_res.deallocate( list );
-  host_res.deallocate( hcount );
-  host_res.deallocate( hlist );
+  testAtomicRefCASOp<ExecPolicy, AtomicPolicy, IdxType, T, CASOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefCASOp<ExecPolicy, AtomicPolicy, IdxType, T,
+                     CompareExchangeWeakOtherOp>(seg, count, list, hcount,
+                                                 hlist, work_res, N);
+  testAtomicRefCASOp<ExecPolicy, AtomicPolicy, IdxType, T,
+                     CompareExchangeStrongOtherOp>(seg, count, list, hcount,
+                                                   hlist, work_res, N);
+
+  work_res.deallocate(count);
+  work_res.deallocate(list);
+  host_res.deallocate(hcount);
+  host_res.deallocate(hlist);
 }
 
 
 TYPED_TEST_SUITE_P(ForallAtomicRefCASTest);
 template <typename T>
 class ForallAtomicRefCASTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicRefCASTest, AtomicRefCASForall)
 {
@@ -166,10 +200,9 @@ TYPED_TEST_P(ForallAtomicRefCASTest, AtomicRefCASForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicRefCASTestImpl<AExec, APol, ResType, IdxType, DType>( 10000 );
+  ForallAtomicRefCASTestImpl<AExec, APol, ResType, IdxType, DType>(10000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefCASTest,
-                            AtomicRefCASForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefCASTest, AtomicRefCASForall);
 
 #endif  //__TEST_FORALL_ATOMICREF_CAS_HPP__
diff --git a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefLoadStore.hpp b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefLoadStore.hpp
index 95209b6c79..8bb250d339 100644
--- a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefLoadStore.hpp
+++ b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefLoadStore.hpp
@@ -6,94 +6,124 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing basic functional tests for load/store atomic operations using forall
+/// Source file containing basic functional tests for load/store atomic
+/// operations using forall
 ///
 
 #ifndef __TEST_FORALL_ATOMICREF_LOADSTORE_HPP__
 #define __TEST_FORALL_ATOMICREF_LOADSTORE_HPP__
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct LoadOtherOp : all_op {
-  LoadOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min((T)seg.size()), max(min),
-    final_min(min), final_max(min)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct LoadOtherOp : all_op
+{
+  LoadOtherOp(T* dcount,
+              T* hcount,
+              camp::resources::Resource work_res,
+              RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min((T)seg.size()),
+        max(min),
+        final_min(min),
+        final_max(min)
   {
     hcount[0] = min;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const
-    { return other.load(); }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const { return other.load(); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct OperatorTOtherOp : all_op {
-  OperatorTOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> RAJA_UNUSED_ARG(seg))
-    : other(dcount), min(T(0)), max(min),
-    final_min(min), final_max(min)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct OperatorTOtherOp : all_op
+{
+  OperatorTOtherOp(T* dcount,
+                   T* hcount,
+                   camp::resources::Resource work_res,
+                   RAJA::TypedRangeSegment<IdxType> RAJA_UNUSED_ARG(seg))
+      : other(dcount), min(T(0)), max(min), final_min(min), final_max(min)
   {
     hcount[0] = min;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const
-    { return other; }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const { return other; }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct StoreOtherOp : all_op {
-  StoreOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min((T)0), max((T)seg.size() - (T)1),
-    final_min(min), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct StoreOtherOp : all_op
+{
+  StoreOtherOp(T* dcount,
+               T* hcount,
+               camp::resources::Resource work_res,
+               RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min((T)0),
+        max((T)seg.size() - (T)1),
+        final_min(min),
+        final_max(max)
   {
     hcount[0] = (T)seg.size();
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { other.store((T)i); return (T)i; }
+  T operator()(IdxType i) const
+  {
+    other.store((T)i);
+    return (T)i;
+  }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct AssignOtherOp : all_op {
-  AssignOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max((T)seg.size() - (T)1),
-    final_min(min), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct AssignOtherOp : all_op
+{
+  AssignOtherOp(T* dcount,
+                T* hcount,
+                camp::resources::Resource work_res,
+                RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max((T)seg.size() - (T)1),
+        final_min(min),
+        final_max(max)
   {
     hcount[0] = (T)seg.size();
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return (other = (T)i); }
+  T operator()(IdxType i) const { return (other = (T)i); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template  < typename ExecPolicy,
-            typename AtomicPolicy,
-            typename IdxType,
-            typename T,
-            template <typename, typename, typename> class OtherOp>
-void
-testAtomicRefLoadStoreOp(RAJA::TypedRangeSegment<IdxType> seg, T* count, T* list,
-    T* hcount, T* hlist,
-    camp::resources::Resource work_res, IdxType N)
+template <typename ExecPolicy,
+          typename AtomicPolicy,
+          typename IdxType,
+          typename T,
+          template <typename, typename, typename>
+          class OtherOp>
+void testAtomicRefLoadStoreOp(RAJA::TypedRangeSegment<IdxType> seg,
+                              T* count,
+                              T* list,
+                              T* hcount,
+                              T* hlist,
+                              camp::resources::Resource work_res,
+                              IdxType N)
 {
   OtherOp<T, AtomicPolicy, IdxType> otherop(count, hcount, work_res, seg);
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      list[i] = otherop.max + (T)1;
-  });
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      T val = otherop(i);
-      list[i] = val;
-  });
+  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i)
+                           { list[i] = otherop.max + (T)1; });
+  RAJA::forall<ExecPolicy>(seg,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             T val   = otherop(i);
+                             list[i] = val;
+                           });
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
 #endif
@@ -101,12 +131,13 @@ testAtomicRefLoadStoreOp(RAJA::TypedRangeSegment<IdxType> seg, T* count, T* list
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  work_res.memcpy( hcount, count, sizeof(T) );
-  work_res.memcpy( hlist, list, sizeof(T) * N );
+  work_res.memcpy(hcount, count, sizeof(T));
+  work_res.memcpy(hlist, list, sizeof(T) * N);
 
   EXPECT_LE(otherop.final_min, hcount[0]);
   EXPECT_GE(otherop.final_max, hcount[0]);
-  for (IdxType i = 0; i < seg.size(); i++) {
+  for (IdxType i = 0; i < seg.size(); i++)
+  {
     EXPECT_LE(otherop.min, hlist[i]);
     EXPECT_GE(otherop.max, hlist[i]);
   }
@@ -118,19 +149,19 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicRefLoadStoreTestImpl( IdxType N )
+void ForallAtomicRefLoadStoreTestImpl(IdxType N)
 {
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
 
-  camp::resources::Resource work_res{WORKINGRES()};
+  camp::resources::Resource work_res {WORKINGRES()};
 
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T * count   = work_res.allocate<T>(1);
-  T * list    = work_res.allocate<T>(N);
+  T* count = work_res.allocate<T>(1);
+  T* list  = work_res.allocate<T>(N);
 
-  T * hcount   = host_res.allocate<T>(1);
-  T * hlist    = host_res.allocate<T>(N);
+  T* hcount = host_res.allocate<T>(1);
+  T* hlist  = host_res.allocate<T>(N);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -140,27 +171,27 @@ void ForallAtomicRefLoadStoreTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  testAtomicRefLoadStoreOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       LoadOtherOp     >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefLoadStoreOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       OperatorTOtherOp>(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefLoadStoreOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       StoreOtherOp    >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefLoadStoreOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       AssignOtherOp   >(seg, count, list, hcount, hlist, work_res, N);
-
-  work_res.deallocate( count );
-  work_res.deallocate( list );
-  host_res.deallocate( hcount );
-  host_res.deallocate( hlist );
+  testAtomicRefLoadStoreOp<ExecPolicy, AtomicPolicy, IdxType, T, LoadOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefLoadStoreOp<ExecPolicy, AtomicPolicy, IdxType, T,
+                           OperatorTOtherOp>(seg, count, list, hcount, hlist,
+                                             work_res, N);
+  testAtomicRefLoadStoreOp<ExecPolicy, AtomicPolicy, IdxType, T, StoreOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefLoadStoreOp<ExecPolicy, AtomicPolicy, IdxType, T, AssignOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+
+  work_res.deallocate(count);
+  work_res.deallocate(list);
+  host_res.deallocate(hcount);
+  host_res.deallocate(hlist);
 }
 
 
 TYPED_TEST_SUITE_P(ForallAtomicRefLoadStoreTest);
 template <typename T>
 class ForallAtomicRefLoadStoreTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicRefLoadStoreTest, AtomicRefLoadStoreForall)
 {
@@ -170,7 +201,7 @@ TYPED_TEST_P(ForallAtomicRefLoadStoreTest, AtomicRefLoadStoreForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicRefLoadStoreTestImpl<AExec, APol, ResType, IdxType, DType>( 10000 );
+  ForallAtomicRefLoadStoreTestImpl<AExec, APol, ResType, IdxType, DType>(10000);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefLoadStoreTest,
diff --git a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefLogical.hpp b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefLogical.hpp
index 382560109c..49ec06689a 100644
--- a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefLogical.hpp
+++ b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefLogical.hpp
@@ -6,154 +6,202 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing basic functional tests for logical atomic operations using forall
+/// Source file containing basic functional tests for logical atomic operations
+/// using forall
 ///
 
 #ifndef __TEST_FORALL_ATOMICREF_LOGICAL_HPP__
 #define __TEST_FORALL_ATOMICREF_LOGICAL_HPP__
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct AndEqOtherOp : int_op {
-  AndEqOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max((T)seg.size()),
-    final_min(min), final_max(min)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct AndEqOtherOp : int_op
+{
+  AndEqOtherOp(T* dcount,
+               T* hcount,
+               camp::resources::Resource work_res,
+               RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max((T)seg.size()),
+        final_min(min),
+        final_max(min)
   {
     hcount[0] = np2m1((T)seg.size());
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other &= (T)i; }
+  T operator()(IdxType i) const { return other &= (T)i; }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct FetchAndOtherOp : int_op {
-  FetchAndOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max(np2m1((T)seg.size())),
-    final_min(min), final_max(min)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct FetchAndOtherOp : int_op
+{
+  FetchAndOtherOp(T* dcount,
+                  T* hcount,
+                  camp::resources::Resource work_res,
+                  RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max(np2m1((T)seg.size())),
+        final_min(min),
+        final_max(min)
   {
     hcount[0] = max;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other.fetch_and((T)i); }
+  T operator()(IdxType i) const { return other.fetch_and((T)i); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct OrEqOtherOp : int_op {
-  OrEqOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max(np2m1((T)seg.size())),
-    final_min(max), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct OrEqOtherOp : int_op
+{
+  OrEqOtherOp(T* dcount,
+              T* hcount,
+              camp::resources::Resource work_res,
+              RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max(np2m1((T)seg.size())),
+        final_min(max),
+        final_max(max)
   {
     hcount[0] = T(0);
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other |= (T)i; }
+  T operator()(IdxType i) const { return other |= (T)i; }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct FetchOrOtherOp : int_op {
-  FetchOrOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max(np2m1((T)seg.size())),
-    final_min(max), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct FetchOrOtherOp : int_op
+{
+  FetchOrOtherOp(T* dcount,
+                 T* hcount,
+                 camp::resources::Resource work_res,
+                 RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max(np2m1((T)seg.size())),
+        final_min(max),
+        final_max(max)
   {
     hcount[0] = T(0);
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other.fetch_or((T)i); }
+  T operator()(IdxType i) const { return other.fetch_or((T)i); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct XorEqOtherOp : int_op {
-  XorEqOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max(np2m1((T)seg.size())),
-    final_min(min), final_max(min)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct XorEqOtherOp : int_op
+{
+  XorEqOtherOp(T* dcount,
+               T* hcount,
+               camp::resources::Resource work_res,
+               RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max(np2m1((T)seg.size())),
+        final_min(min),
+        final_max(min)
   {
     hcount[0] = T(0);
     work_res.memcpy(dcount, hcount, sizeof(T));
-    for (IdxType i = 0; i < seg.size(); ++i) {
-      final_min ^= (T)i; final_max ^= (T)i;
+    for (IdxType i = 0; i < seg.size(); ++i)
+    {
+      final_min ^= (T)i;
+      final_max ^= (T)i;
     }
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other ^= (T)i; }
+  T operator()(IdxType i) const { return other ^= (T)i; }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct FetchXorOtherOp : int_op {
-  FetchXorOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max(np2m1((T)seg.size())),
-    final_min(min), final_max(min)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct FetchXorOtherOp : int_op
+{
+  FetchXorOtherOp(T* dcount,
+                  T* hcount,
+                  camp::resources::Resource work_res,
+                  RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max(np2m1((T)seg.size())),
+        final_min(min),
+        final_max(min)
   {
     hcount[0] = T(0);
     work_res.memcpy(dcount, hcount, sizeof(T));
-    for (IdxType i = 0; i < seg.size(); ++i) {
-      final_min ^= (T)i; final_max ^= (T)i;
+    for (IdxType i = 0; i < seg.size(); ++i)
+    {
+      final_min ^= (T)i;
+      final_max ^= (T)i;
     }
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other.fetch_xor((T)i); }
+  T operator()(IdxType i) const { return other.fetch_xor((T)i); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template  < typename ExecPolicy,
-            typename AtomicPolicy,
-            typename IdxType,
-            typename T,
-            template <typename, typename, typename> class OtherOp>
+template <typename ExecPolicy,
+          typename AtomicPolicy,
+          typename IdxType,
+          typename T,
+          template <typename, typename, typename>
+          class OtherOp>
 // No test when underlying op type is int, and index type is float
 typename std::enable_if<
-           (std::is_floating_point<T>::value && 
-            std::is_base_of<int_op, OtherOp<T,AtomicPolicy, IdxType>>::value)
-         >::type
-testAtomicRefLogicalOp(RAJA::TypedRangeSegment<IdxType> RAJA_UNUSED_ARG(seg), 
-                     T* RAJA_UNUSED_ARG(count), T* RAJA_UNUSED_ARG(list),
-                     T* RAJA_UNUSED_ARG(hcount), T* RAJA_UNUSED_ARG(hlist),
-                     camp::resources::Resource RAJA_UNUSED_ARG(work_res), IdxType RAJA_UNUSED_ARG(N))
-{
-}
+    (std::is_floating_point<T>::value &&
+     std::is_base_of<int_op, OtherOp<T, AtomicPolicy, IdxType>>::value)>::type
+testAtomicRefLogicalOp(RAJA::TypedRangeSegment<IdxType> RAJA_UNUSED_ARG(seg),
+                       T* RAJA_UNUSED_ARG(count),
+                       T* RAJA_UNUSED_ARG(list),
+                       T* RAJA_UNUSED_ARG(hcount),
+                       T* RAJA_UNUSED_ARG(hlist),
+                       camp::resources::Resource RAJA_UNUSED_ARG(work_res),
+                       IdxType RAJA_UNUSED_ARG(N))
+{}
 
-template  < typename ExecPolicy,
-            typename AtomicPolicy,
-            typename IdxType,
-            typename T,
-            template <typename, typename, typename> class OtherOp>
+template <typename ExecPolicy,
+          typename AtomicPolicy,
+          typename IdxType,
+          typename T,
+          template <typename, typename, typename>
+          class OtherOp>
 // Run test if T is integral and operation is int_op, or for any all_op
 typename std::enable_if<
-           (std::is_integral<T>::value && 
-            std::is_base_of<int_op, OtherOp<T,AtomicPolicy, IdxType>>::value) || 
-            (std::is_base_of<all_op, OtherOp<T,AtomicPolicy, IdxType>>::value)
-         >::type
-testAtomicRefLogicalOp(RAJA::TypedRangeSegment<IdxType> seg, T* count, T* list,
-    T* hcount, T* hlist,
-    camp::resources::Resource work_res, IdxType N)
+    (std::is_integral<T>::value &&
+     std::is_base_of<int_op, OtherOp<T, AtomicPolicy, IdxType>>::value) ||
+    (std::is_base_of<all_op, OtherOp<T, AtomicPolicy, IdxType>>::value)>::type
+testAtomicRefLogicalOp(RAJA::TypedRangeSegment<IdxType> seg,
+                       T* count,
+                       T* list,
+                       T* hcount,
+                       T* hlist,
+                       camp::resources::Resource work_res,
+                       IdxType N)
 {
   OtherOp<T, AtomicPolicy, IdxType> otherop(count, hcount, work_res, seg);
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      list[i] = otherop.max + (T)1;
-  });
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      T val = otherop(i);
-      list[i] = val;
-  });
+  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i)
+                           { list[i] = otherop.max + (T)1; });
+  RAJA::forall<ExecPolicy>(seg,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             T val   = otherop(i);
+                             list[i] = val;
+                           });
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
 #endif
@@ -161,12 +209,13 @@ testAtomicRefLogicalOp(RAJA::TypedRangeSegment<IdxType> seg, T* count, T* list,
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  work_res.memcpy( hcount, count, sizeof(T) );
-  work_res.memcpy( hlist, list, sizeof(T) * N );
+  work_res.memcpy(hcount, count, sizeof(T));
+  work_res.memcpy(hlist, list, sizeof(T) * N);
 
   EXPECT_LE(otherop.final_min, hcount[0]);
   EXPECT_GE(otherop.final_max, hcount[0]);
-  for (IdxType i = 0; i < seg.size(); i++) {
+  for (IdxType i = 0; i < seg.size(); i++)
+  {
     EXPECT_LE(otherop.min, hlist[i]);
     EXPECT_GE(otherop.max, hlist[i]);
   }
@@ -178,19 +227,19 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicRefLogicalTestImpl( IdxType N )
+void ForallAtomicRefLogicalTestImpl(IdxType N)
 {
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
 
-  camp::resources::Resource work_res{WORKINGRES()};
+  camp::resources::Resource work_res {WORKINGRES()};
 
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T * count   = work_res.allocate<T>(1);
-  T * list    = work_res.allocate<T>(N);
+  T* count = work_res.allocate<T>(1);
+  T* list  = work_res.allocate<T>(N);
 
-  T * hcount   = host_res.allocate<T>(1);
-  T * hlist    = host_res.allocate<T>(N);
+  T* hcount = host_res.allocate<T>(1);
+  T* hlist  = host_res.allocate<T>(N);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -200,33 +249,32 @@ void ForallAtomicRefLogicalTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  // Note: These integral tests require return type conditional overloading 
+  // Note: These integral tests require return type conditional overloading
   //       of testAtomicRefLogicalOp
-  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       AndEqOtherOp   >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       FetchAndOtherOp>(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       OrEqOtherOp    >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       FetchOrOtherOp >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       XorEqOtherOp   >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       FetchXorOtherOp>(seg, count, list, hcount, hlist, work_res, N);
-
-  work_res.deallocate( count );
-  work_res.deallocate( list );
-  host_res.deallocate( hcount );
-  host_res.deallocate( hlist );
+  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, AndEqOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, FetchAndOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, OrEqOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, FetchOrOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, XorEqOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefLogicalOp<ExecPolicy, AtomicPolicy, IdxType, T, FetchXorOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+
+  work_res.deallocate(count);
+  work_res.deallocate(list);
+  host_res.deallocate(hcount);
+  host_res.deallocate(hlist);
 }
 
 
 TYPED_TEST_SUITE_P(ForallAtomicRefLogicalTest);
 template <typename T>
 class ForallAtomicRefLogicalTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicRefLogicalTest, AtomicRefLogicalForall)
 {
@@ -236,10 +284,9 @@ TYPED_TEST_P(ForallAtomicRefLogicalTest, AtomicRefLogicalForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicRefLogicalTestImpl<AExec, APol, ResType, IdxType, DType>( 10000 );
+  ForallAtomicRefLogicalTestImpl<AExec, APol, ResType, IdxType, DType>(10000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefLogicalTest,
-                            AtomicRefLogicalForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefLogicalTest, AtomicRefLogicalForall);
 
 #endif  //__TEST_FORALL_ATOMICREF_LOGICAL_HPP__
diff --git a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefMinMax.hpp b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefMinMax.hpp
index b8860def9f..02547f773a 100644
--- a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefMinMax.hpp
+++ b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefMinMax.hpp
@@ -6,94 +6,124 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing basic functional tests for min/max atomic operations using forall
+/// Source file containing basic functional tests for min/max atomic operations
+/// using forall
 ///
 
 #ifndef __TEST_FORALL_ATOMICREF_MINMAX_HPP__
 #define __TEST_FORALL_ATOMICREF_MINMAX_HPP__
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct MaxEqOtherOp : all_op {
-  MaxEqOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max((T)seg.size() - (T)1),
-    final_min(max), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct MaxEqOtherOp : all_op
+{
+  MaxEqOtherOp(T* dcount,
+               T* hcount,
+               camp::resources::Resource work_res,
+               RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max((T)seg.size() - (T)1),
+        final_min(max),
+        final_max(max)
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other.max((T)i); }
+  T operator()(IdxType i) const { return other.max((T)i); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct FetchMaxOtherOp : all_op {
-  FetchMaxOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max((T)seg.size() - (T)1),
-    final_min(max), final_max(max)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct FetchMaxOtherOp : all_op
+{
+  FetchMaxOtherOp(T* dcount,
+                  T* hcount,
+                  camp::resources::Resource work_res,
+                  RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max((T)seg.size() - (T)1),
+        final_min(max),
+        final_max(max)
   {
     hcount[0] = (T)0;
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other.fetch_max((T)i); }
+  T operator()(IdxType i) const { return other.fetch_max((T)i); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct MinEqOtherOp : all_op {
-  MinEqOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max((T)seg.size() - (T)1),
-    final_min(min), final_max(min)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct MinEqOtherOp : all_op
+{
+  MinEqOtherOp(T* dcount,
+               T* hcount,
+               camp::resources::Resource work_res,
+               RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max((T)seg.size() - (T)1),
+        final_min(min),
+        final_max(min)
   {
     hcount[0] = (T)seg.size();
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other.min((T)i); }
+  T operator()(IdxType i) const { return other.min((T)i); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct FetchMinOtherOp : all_op {
-  FetchMinOtherOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : other(dcount), min(T(0)), max((T)seg.size()),
-    final_min(min), final_max(min)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct FetchMinOtherOp : all_op
+{
+  FetchMinOtherOp(T* dcount,
+                  T* hcount,
+                  camp::resources::Resource work_res,
+                  RAJA::TypedRangeSegment<IdxType> seg)
+      : other(dcount),
+        min(T(0)),
+        max((T)seg.size()),
+        final_min(min),
+        final_max(min)
   {
     hcount[0] = (T)seg.size();
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType i) const
-    { return other.fetch_min((T)i); }
+  T operator()(IdxType i) const { return other.fetch_min((T)i); }
   RAJA::AtomicRef<T, AtomicPolicy> other;
   T min, max, final_min, final_max;
 };
 
-template  < typename ExecPolicy,
-            typename AtomicPolicy,
-            typename IdxType,
-            typename T,
-            template <typename, typename, typename> class OtherOp>
-void
-testAtomicRefMinMaxOp(RAJA::TypedRangeSegment<IdxType> seg, T* count, T* list,
-    T* hcount, T* hlist,
-    camp::resources::Resource work_res, IdxType N)
+template <typename ExecPolicy,
+          typename AtomicPolicy,
+          typename IdxType,
+          typename T,
+          template <typename, typename, typename>
+          class OtherOp>
+void testAtomicRefMinMaxOp(RAJA::TypedRangeSegment<IdxType> seg,
+                           T* count,
+                           T* list,
+                           T* hcount,
+                           T* hlist,
+                           camp::resources::Resource work_res,
+                           IdxType N)
 {
   OtherOp<T, AtomicPolicy, IdxType> otherop(count, hcount, work_res, seg);
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      list[i] = otherop.max + (T)1;
-  });
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      T val = otherop(i);
-      list[i] = val;
-  });
+  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i)
+                           { list[i] = otherop.max + (T)1; });
+  RAJA::forall<ExecPolicy>(seg,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             T val   = otherop(i);
+                             list[i] = val;
+                           });
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
 #endif
@@ -101,12 +131,13 @@ testAtomicRefMinMaxOp(RAJA::TypedRangeSegment<IdxType> seg, T* count, T* list,
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  work_res.memcpy( hcount, count, sizeof(T) );
-  work_res.memcpy( hlist, list, sizeof(T) * N );
+  work_res.memcpy(hcount, count, sizeof(T));
+  work_res.memcpy(hlist, list, sizeof(T) * N);
 
   EXPECT_LE(otherop.final_min, hcount[0]);
   EXPECT_GE(otherop.final_max, hcount[0]);
-  for (IdxType i = 0; i < seg.size(); i++) {
+  for (IdxType i = 0; i < seg.size(); i++)
+  {
     EXPECT_LE(otherop.min, hlist[i]);
     EXPECT_GE(otherop.max, hlist[i]);
   }
@@ -118,19 +149,19 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicRefMinMaxTestImpl( IdxType N )
+void ForallAtomicRefMinMaxTestImpl(IdxType N)
 {
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
 
-  camp::resources::Resource work_res{WORKINGRES()};
+  camp::resources::Resource work_res {WORKINGRES()};
 
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T * count   = work_res.allocate<T>(1);
-  T * list    = work_res.allocate<T>(N);
+  T* count = work_res.allocate<T>(1);
+  T* list  = work_res.allocate<T>(N);
 
-  T * hcount   = host_res.allocate<T>(1);
-  T * hlist    = host_res.allocate<T>(N);
+  T* hcount = host_res.allocate<T>(1);
+  T* hlist  = host_res.allocate<T>(N);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -140,27 +171,26 @@ void ForallAtomicRefMinMaxTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  testAtomicRefMinMaxOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       MaxEqOtherOp   >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefMinMaxOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       FetchMaxOtherOp>(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefMinMaxOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       MinEqOtherOp   >(seg, count, list, hcount, hlist, work_res, N);
-  testAtomicRefMinMaxOp<ExecPolicy, AtomicPolicy, IdxType, T, 
-                       FetchMinOtherOp>(seg, count, list, hcount, hlist, work_res, N);
-
-  work_res.deallocate( count );
-  work_res.deallocate( list );
-  host_res.deallocate( hcount );
-  host_res.deallocate( hlist );
+  testAtomicRefMinMaxOp<ExecPolicy, AtomicPolicy, IdxType, T, MaxEqOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefMinMaxOp<ExecPolicy, AtomicPolicy, IdxType, T, FetchMaxOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefMinMaxOp<ExecPolicy, AtomicPolicy, IdxType, T, MinEqOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+  testAtomicRefMinMaxOp<ExecPolicy, AtomicPolicy, IdxType, T, FetchMinOtherOp>(
+      seg, count, list, hcount, hlist, work_res, N);
+
+  work_res.deallocate(count);
+  work_res.deallocate(list);
+  host_res.deallocate(hcount);
+  host_res.deallocate(hlist);
 }
 
 
 TYPED_TEST_SUITE_P(ForallAtomicRefMinMaxTest);
 template <typename T>
 class ForallAtomicRefMinMaxTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicRefMinMaxTest, AtomicRefMinMaxForall)
 {
@@ -170,10 +200,9 @@ TYPED_TEST_P(ForallAtomicRefMinMaxTest, AtomicRefMinMaxForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicRefMinMaxTestImpl<AExec, APol, ResType, IdxType, DType>( 10000 );
+  ForallAtomicRefMinMaxTestImpl<AExec, APol, ResType, IdxType, DType>(10000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefMinMaxTest,
-                            AtomicRefMinMaxForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefMinMaxTest, AtomicRefMinMaxForall);
 
 #endif  //__TEST_FORALL_ATOMICREF_MINMAX_HPP__
diff --git a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefSub.hpp b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefSub.hpp
index f4579fb786..bf15327085 100644
--- a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefSub.hpp
+++ b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefSub.hpp
@@ -6,96 +6,118 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing basic functional tests for subtraction arithmetic atomic operations using forall
+/// Source file containing basic functional tests for subtraction arithmetic
+/// atomic operations using forall
 ///
 
 #ifndef __TEST_FORALL_ATOMICREF_SUB_HPP__
 #define __TEST_FORALL_ATOMICREF_SUB_HPP__
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct PreDecCountOp {
-  PreDecCountOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : counter(dcount), min((T)0), max((T)seg.size()-(T)1), final((T)0)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct PreDecCountOp
+{
+  PreDecCountOp(T* dcount,
+                T* hcount,
+                camp::resources::Resource work_res,
+                RAJA::TypedRangeSegment<IdxType> seg)
+      : counter(dcount), min((T)0), max((T)seg.size() - (T)1), final((T)0)
   {
     hcount[0] = (T)seg.size();
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const {
-      return (--counter);
-    }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const { return (--counter); }
   RAJA::AtomicRef<T, AtomicPolicy> counter;
   T min, max, final;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct PostDecCountOp {
-  PostDecCountOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : counter(dcount), min((T)0), max((T)seg.size()-(T)1), final((T)0)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct PostDecCountOp
+{
+  PostDecCountOp(T* dcount,
+                 T* hcount,
+                 camp::resources::Resource work_res,
+                 RAJA::TypedRangeSegment<IdxType> seg)
+      : counter(dcount), min((T)0), max((T)seg.size() - (T)1), final((T)0)
   {
     hcount[0] = (T)seg.size();
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const {
-      return (counter--) - (T)1;
-    }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const { return (counter--) - (T)1; }
   RAJA::AtomicRef<T, AtomicPolicy> counter;
   T min, max, final;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct SubEqCountOp {
-  SubEqCountOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : counter(dcount), min((T)0), max((T)seg.size()-(T)1), final((T)0)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct SubEqCountOp
+{
+  SubEqCountOp(T* dcount,
+               T* hcount,
+               camp::resources::Resource work_res,
+               RAJA::TypedRangeSegment<IdxType> seg)
+      : counter(dcount), min((T)0), max((T)seg.size() - (T)1), final((T)0)
   {
     hcount[0] = (T)seg.size();
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const {
-      return (counter -= (T)1);
-    }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const { return (counter -= (T)1); }
   RAJA::AtomicRef<T, AtomicPolicy> counter;
   T min, max, final;
 };
 
-template < typename T, typename AtomicPolicy, typename IdxType >
-struct FetchSubCountOp {
-  FetchSubCountOp(T* dcount, T* hcount, camp::resources::Resource work_res, RAJA::TypedRangeSegment<IdxType> seg)
-    : counter(dcount), min((T)0), max((T)seg.size()-(T)1), final((T)0)
+template <typename T, typename AtomicPolicy, typename IdxType>
+struct FetchSubCountOp
+{
+  FetchSubCountOp(T* dcount,
+                  T* hcount,
+                  camp::resources::Resource work_res,
+                  RAJA::TypedRangeSegment<IdxType> seg)
+      : counter(dcount), min((T)0), max((T)seg.size() - (T)1), final((T)0)
   {
     hcount[0] = (T)seg.size();
     work_res.memcpy(dcount, hcount, sizeof(T));
   }
   RAJA_HOST_DEVICE
-    T operator()(IdxType RAJA_UNUSED_ARG(i)) const {
-      return counter.fetch_sub((T)1) - (T)1;
-    }
+  T operator()(IdxType RAJA_UNUSED_ARG(i)) const
+  {
+    return counter.fetch_sub((T)1) - (T)1;
+  }
   RAJA::AtomicRef<T, AtomicPolicy> counter;
   T min, max, final;
 };
 
 template <typename ExecPolicy,
-         typename AtomicPolicy,
-         typename IdxType,
-         typename T,
-         template <typename, typename, typename> class CountOp>
+          typename AtomicPolicy,
+          typename IdxType,
+          typename T,
+          template <typename, typename, typename>
+          class CountOp>
 void testAtomicRefSub(RAJA::TypedRangeSegment<IdxType> seg,
-    T* count, T* list, bool* hit,
-    T* hcount, T* hlist, bool* hhit,
-    camp::resources::Resource work_res, IdxType N)
+                      T* count,
+                      T* list,
+                      bool* hit,
+                      T* hcount,
+                      T* hlist,
+                      bool* hhit,
+                      camp::resources::Resource work_res,
+                      IdxType N)
 {
   CountOp<T, AtomicPolicy, IdxType> countop(count, hcount, work_res, seg);
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      list[i] = countop.max + (T)1;
-      hit[i] = false;
-      });
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-      T val = countop(i);
-      list[i] = val;
-      hit[(IdxType)val] = true;
-      });
+  RAJA::forall<ExecPolicy>(seg,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             list[i] = countop.max + (T)1;
+                             hit[i]  = false;
+                           });
+  RAJA::forall<ExecPolicy>(seg,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             T val             = countop(i);
+                             list[i]           = val;
+                             hit[(IdxType)val] = true;
+                           });
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
 #endif
@@ -103,12 +125,13 @@ void testAtomicRefSub(RAJA::TypedRangeSegment<IdxType> seg,
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  work_res.memcpy( hcount, count, sizeof(T) );
-  work_res.memcpy( hlist, list, sizeof(T) * N );
-  work_res.memcpy( hhit, hit, sizeof(bool) * N );
+  work_res.memcpy(hcount, count, sizeof(T));
+  work_res.memcpy(hlist, list, sizeof(T) * N);
+  work_res.memcpy(hhit, hit, sizeof(bool) * N);
 
   EXPECT_EQ(countop.final, hcount[0]);
-  for (IdxType i = 0; i < seg.size(); i++) {
+  for (IdxType i = 0; i < seg.size(); i++)
+  {
     EXPECT_LE(countop.min, hlist[i]);
     EXPECT_GE(countop.max, hlist[i]);
     EXPECT_TRUE(hhit[i]);
@@ -121,21 +144,21 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicRefSubTestImpl( IdxType N )
+void ForallAtomicRefSubTestImpl(IdxType N)
 {
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
 
-  camp::resources::Resource work_res{WORKINGRES()};
+  camp::resources::Resource work_res {WORKINGRES()};
 
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T * count   = work_res.allocate<T>(1);
-  T * list    = work_res.allocate<T>(N);
-  bool * hit  = work_res.allocate<bool>(N);
+  T* count  = work_res.allocate<T>(1);
+  T* list   = work_res.allocate<T>(N);
+  bool* hit = work_res.allocate<bool>(N);
 
-  T * hcount   = host_res.allocate<T>(1);
-  T * hlist    = host_res.allocate<T>(N);
-  bool * hhit  = host_res.allocate<bool>(N);
+  T* hcount  = host_res.allocate<T>(1);
+  T* hlist   = host_res.allocate<T>(N);
+  bool* hhit = host_res.allocate<bool>(N);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -145,29 +168,28 @@ void ForallAtomicRefSubTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  testAtomicRefSub<ExecPolicy, AtomicPolicy, IdxType, T, 
-                     PreDecCountOp  >(seg, count, list, hit, hcount, hlist, hhit, work_res, N);
-  testAtomicRefSub<ExecPolicy, AtomicPolicy, IdxType, T, 
-                     PostDecCountOp >(seg, count, list, hit, hcount, hlist, hhit, work_res, N);
-  testAtomicRefSub<ExecPolicy, AtomicPolicy, IdxType, T, 
-                     SubEqCountOp   >(seg, count, list, hit, hcount, hlist, hhit, work_res, N);
-  testAtomicRefSub<ExecPolicy, AtomicPolicy, IdxType, T, 
-                     FetchSubCountOp>(seg, count, list, hit, hcount, hlist, hhit, work_res, N);
-
-  work_res.deallocate( count );
-  work_res.deallocate( list );
-  work_res.deallocate( hit );
-  host_res.deallocate( hcount );
-  host_res.deallocate( hlist );
-  host_res.deallocate( hhit ); 
+  testAtomicRefSub<ExecPolicy, AtomicPolicy, IdxType, T, PreDecCountOp>(
+      seg, count, list, hit, hcount, hlist, hhit, work_res, N);
+  testAtomicRefSub<ExecPolicy, AtomicPolicy, IdxType, T, PostDecCountOp>(
+      seg, count, list, hit, hcount, hlist, hhit, work_res, N);
+  testAtomicRefSub<ExecPolicy, AtomicPolicy, IdxType, T, SubEqCountOp>(
+      seg, count, list, hit, hcount, hlist, hhit, work_res, N);
+  testAtomicRefSub<ExecPolicy, AtomicPolicy, IdxType, T, FetchSubCountOp>(
+      seg, count, list, hit, hcount, hlist, hhit, work_res, N);
+
+  work_res.deallocate(count);
+  work_res.deallocate(list);
+  work_res.deallocate(hit);
+  host_res.deallocate(hcount);
+  host_res.deallocate(hlist);
+  host_res.deallocate(hhit);
 }
 
 
 TYPED_TEST_SUITE_P(ForallAtomicRefSubTest);
 template <typename T>
 class ForallAtomicRefSubTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicRefSubTest, AtomicRefSubForall)
 {
@@ -177,10 +199,9 @@ TYPED_TEST_P(ForallAtomicRefSubTest, AtomicRefSubForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicRefSubTestImpl<AExec, APol, ResType, IdxType, DType>( 10000 );
+  ForallAtomicRefSubTestImpl<AExec, APol, ResType, IdxType, DType>(10000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefSubTest,
-                            AtomicRefSubForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicRefSubTest, AtomicRefSubForall);
 
 #endif  //__TEST_FORALL_ATOMICREF_SUB_HPP__
diff --git a/test/functional/forall/atomic-view/tests/test-forall-AtomicMultiView.hpp b/test/functional/forall/atomic-view/tests/test-forall-AtomicMultiView.hpp
index fc67162823..c71c363d75 100644
--- a/test/functional/forall/atomic-view/tests/test-forall-AtomicMultiView.hpp
+++ b/test/functional/forall/atomic-view/tests/test-forall-AtomicMultiView.hpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Header file containing basic functional tests for atomic operations with forall and views.
+/// Header file containing basic functional tests for atomic operations with
+/// forall and views.
 ///
 
 #ifndef __TEST_FORALL_ATOMIC_MULTIVIEW_HPP__
@@ -19,25 +20,26 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicMultiViewTestImpl( IdxType N )
+void ForallAtomicMultiViewTestImpl(IdxType N)
 {
   // Functionally similar to ForallAtomicViewTestImpl
 
-  int dst_side = static_cast<int>( std::sqrt( static_cast<double>(N/2) ) ); // dest[] dimension
-  int src_side = dst_side*2; // source[] dimension
+  int dst_side = static_cast<int>(
+      std::sqrt(static_cast<double>(N / 2)));  // dest[] dimension
+  int src_side = dst_side * 2;                 // source[] dimension
 
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
   RAJA::TypedRangeSegment<IdxType> seg_dstside(0, dst_side);
   RAJA::TypedRangeSegment<IdxType> seg_srcside(0, src_side);
 
-  camp::resources::Resource work_res{WORKINGRES()};
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource work_res {WORKINGRES()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T *  actualsource = work_res.allocate<T> (N);
-  T ** source       = work_res.allocate<T*>(src_side);
-  T *  actualdest   = work_res.allocate<T> (N/2);
-  T ** dest         = work_res.allocate<T*>(dst_side);
-  T *  check_array  = host_res.allocate<T> (N/2);
+  T* actualsource = work_res.allocate<T>(N);
+  T** source      = work_res.allocate<T*>(src_side);
+  T* actualdest   = work_res.allocate<T>(N / 2);
+  T** dest        = work_res.allocate<T*>(dst_side);
+  T* check_array  = host_res.allocate<T>(N / 2);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -49,19 +51,14 @@ void ForallAtomicMultiViewTestImpl( IdxType N )
 
   // assumes each source[] will be 2x size of each dest[], src_side x dst_side
   RAJA::forall<ExecPolicy>(seg_srcside, [=] RAJA_HOST_DEVICE(IdxType ii)
-  {
-    source[ii] = actualsource+(ii*dst_side);
-  });
+                           { source[ii] = actualsource + (ii * dst_side); });
 
   // assumes each dest[] will be a square matrix, dst_side x dst_side
   RAJA::forall<ExecPolicy>(seg_dstside, [=] RAJA_HOST_DEVICE(IdxType ii)
-  {
-    dest[ii] = actualdest+(ii*dst_side);
-  });
+                           { dest[ii] = actualdest + (ii * dst_side); });
 
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-    actualsource[i] = (T)1;
-  });
+  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i)
+                           { actualsource[i] = (T)1; });
 
   // use atomic add to reduce the array
   // 1D defaut MultiView
@@ -73,22 +70,27 @@ void ForallAtomicMultiViewTestImpl( IdxType N )
 
 
   // Zero out dest using atomic MultiView
-  RAJA::forall<ExecPolicy>(seg_dstside, [=] RAJA_HOST_DEVICE(IdxType i) {
-    for ( int aopidx = 0; aopidx < dst_side; ++aopidx )
-    {
-      sum_atomic_view(i,aopidx) = (T)0;
-    }
-  });
+  RAJA::forall<ExecPolicy>(seg_dstside,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             for (int aopidx = 0; aopidx < dst_side; ++aopidx)
+                             {
+                               sum_atomic_view(i, aopidx) = (T)0;
+                             }
+                           });
 
   // Assign values to dest using atomic MultiView
-  RAJA::forall<ExecPolicy>(seg_srcside, [=] RAJA_HOST_DEVICE(IdxType i) {
-    for ( int aopidx = 0; aopidx < dst_side; ++aopidx )
-    {
-      sum_atomic_view(i/2, aopidx) += vec_view(aopidx,i/2);
-    }
-  });
-
-  work_res.memcpy( check_array, actualdest, sizeof(T) * N/2 );
+  RAJA::forall<ExecPolicy>(seg_srcside,
+                           [=] RAJA_HOST_DEVICE(IdxType i)
+                           {
+                             for (int aopidx = 0; aopidx < dst_side; ++aopidx)
+                             {
+                               sum_atomic_view(i / 2, aopidx) +=
+                                   vec_view(aopidx, i / 2);
+                             }
+                           });
+
+  work_res.memcpy(check_array, actualdest, sizeof(T) * N / 2);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -98,22 +100,22 @@ void ForallAtomicMultiViewTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  for (IdxType i = 0; i < N / 2; ++i) {
+  for (IdxType i = 0; i < N / 2; ++i)
+  {
     EXPECT_EQ((T)2, check_array[i]);
   }
 
-  work_res.deallocate( actualsource );
-  work_res.deallocate( source );
-  work_res.deallocate( actualdest );
-  work_res.deallocate( dest );
-  host_res.deallocate( check_array );
+  work_res.deallocate(actualsource);
+  work_res.deallocate(source);
+  work_res.deallocate(actualdest);
+  work_res.deallocate(dest);
+  host_res.deallocate(check_array);
 }
 
 TYPED_TEST_SUITE_P(ForallAtomicMultiViewTest);
 template <typename T>
 class ForallAtomicMultiViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicMultiViewTest, AtomicMultiViewForall)
 {
@@ -123,10 +125,9 @@ TYPED_TEST_P(ForallAtomicMultiViewTest, AtomicMultiViewForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicMultiViewTestImpl<AExec, APol, ResType, IdxType, DType>( 20000 );
+  ForallAtomicMultiViewTestImpl<AExec, APol, ResType, IdxType, DType>(20000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallAtomicMultiViewTest,
-                            AtomicMultiViewForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicMultiViewTest, AtomicMultiViewForall);
 
 #endif  //__TEST_FORALL_ATOMIC_MULTIVIEW_HPP__
diff --git a/test/functional/forall/atomic-view/tests/test-forall-AtomicOutOfBoundsMultiView.hpp b/test/functional/forall/atomic-view/tests/test-forall-AtomicOutOfBoundsMultiView.hpp
index a33c0f591a..c066673e4a 100644
--- a/test/functional/forall/atomic-view/tests/test-forall-AtomicOutOfBoundsMultiView.hpp
+++ b/test/functional/forall/atomic-view/tests/test-forall-AtomicOutOfBoundsMultiView.hpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Header file containing basic functional tests for atomic operations with forall and views.
+/// Header file containing basic functional tests for atomic operations with
+/// forall and views.
 ///
 
 #ifndef __TEST_FORALL_ATOMICOUTOFBOUNDS_MULTIVIEW_HPP__
@@ -19,25 +20,26 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicOutOfBoundsMultiViewTestImpl( IdxType N )
+void ForallAtomicOutOfBoundsMultiViewTestImpl(IdxType N)
 {
   // Functionally similar to ForallAtomicViewTestImpl
 
-  int dst_side = static_cast<int>( std::sqrt( static_cast<double>(N/2) ) ); // dest[] dimension
-  int src_side = dst_side*2; // source[] dimension
+  int dst_side = static_cast<int>(
+      std::sqrt(static_cast<double>(N / 2)));  // dest[] dimension
+  int src_side = dst_side * 2;                 // source[] dimension
 
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
   RAJA::TypedRangeSegment<IdxType> seg_dstside(0, dst_side);
   RAJA::TypedRangeSegment<IdxType> seg_srcside(0, src_side);
 
-  camp::resources::Resource work_res{WORKINGRES()};
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource work_res {WORKINGRES()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T *  actualsource = work_res.allocate<T> (N);
-  T ** source       = work_res.allocate<T*>(src_side);
-  T *  actualdest   = work_res.allocate<T> (N/2);
-  T ** dest         = work_res.allocate<T*>(dst_side);
-  T *  check_array  = host_res.allocate<T> (N/2);
+  T* actualsource = work_res.allocate<T>(N);
+  T** source      = work_res.allocate<T*>(src_side);
+  T* actualdest   = work_res.allocate<T>(N / 2);
+  T** dest        = work_res.allocate<T*>(dst_side);
+  T* check_array  = host_res.allocate<T>(N / 2);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -58,12 +60,12 @@ void ForallAtomicOutOfBoundsMultiViewTestImpl( IdxType N )
   auto sum_atomic_view = RAJA::make_atomic_view<AtomicPolicy>(sum_view);
 
 
-  // Need gtest death test to avoid complete failure due to eventual seg fault
-  #if defined(RAJA_ENABLE_TARGET_OPENMP)
-  EXPECT_DEATH_IF_SUPPORTED( (sum_atomic_view(0,-1) = (T)0), "" );
-  #else
-  EXPECT_THROW( (sum_atomic_view(0,-1) = (T)0), std::runtime_error );
-  #endif
+// Need gtest death test to avoid complete failure due to eventual seg fault
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+  EXPECT_DEATH_IF_SUPPORTED((sum_atomic_view(0, -1) = (T)0), "");
+#else
+  EXPECT_THROW((sum_atomic_view(0, -1) = (T)0), std::runtime_error);
+#endif
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -73,20 +75,20 @@ void ForallAtomicOutOfBoundsMultiViewTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  work_res.deallocate( actualsource );
-  work_res.deallocate( source );
-  work_res.deallocate( actualdest );
-  work_res.deallocate( dest );
-  host_res.deallocate( check_array );
+  work_res.deallocate(actualsource);
+  work_res.deallocate(source);
+  work_res.deallocate(actualdest);
+  work_res.deallocate(dest);
+  host_res.deallocate(check_array);
 }
 
 TYPED_TEST_SUITE_P(ForallAtomicOutOfBoundsMultiViewTest);
 template <typename T>
 class ForallAtomicOutOfBoundsMultiViewTest : public ::testing::Test
-{
-};
+{};
 
-TYPED_TEST_P(ForallAtomicOutOfBoundsMultiViewTest, AtomicOutOfBoundsMultiViewForall)
+TYPED_TEST_P(ForallAtomicOutOfBoundsMultiViewTest,
+             AtomicOutOfBoundsMultiViewForall)
 {
   using AExec   = typename camp::at<TypeParam, camp::num<0>>::type;
   using APol    = typename camp::at<TypeParam, camp::num<1>>::type;
@@ -94,7 +96,8 @@ TYPED_TEST_P(ForallAtomicOutOfBoundsMultiViewTest, AtomicOutOfBoundsMultiViewFor
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicOutOfBoundsMultiViewTestImpl<AExec, APol, ResType, IdxType, DType>( 20000 );
+  ForallAtomicOutOfBoundsMultiViewTestImpl<AExec, APol, ResType, IdxType,
+                                           DType>(20000);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallAtomicOutOfBoundsMultiViewTest,
diff --git a/test/functional/forall/atomic-view/tests/test-forall-AtomicView.hpp b/test/functional/forall/atomic-view/tests/test-forall-AtomicView.hpp
index 588e95bf82..325fba2a0a 100644
--- a/test/functional/forall/atomic-view/tests/test-forall-AtomicView.hpp
+++ b/test/functional/forall/atomic-view/tests/test-forall-AtomicView.hpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Header file containing basic functional tests for atomic operations with forall and views.
+/// Header file containing basic functional tests for atomic operations with
+/// forall and views.
 ///
 
 #ifndef __TEST_FORALL_ATOMIC_VIEW_HPP__
@@ -17,18 +18,18 @@ template <typename ExecPolicy,
           typename WORKINGRES,
           typename IdxType,
           typename T>
-void ForallAtomicViewTestImpl( IdxType N )
+void ForallAtomicViewTestImpl(IdxType N)
 {
   RAJA::TypedRangeSegment<IdxType> seg(0, N);
   RAJA::TypedRangeSegment<IdxType> seg_half(0, N / 2);
 
-  camp::resources::Resource work_res{WORKINGRES()};
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource work_res {WORKINGRES()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  T * hsource = host_res.allocate<T>(N);
-  T * source = work_res.allocate<T>(N);
-  T * dest = work_res.allocate<T>(N/2);
-  T * check_array = host_res.allocate<T>(N/2);
+  T* hsource     = host_res.allocate<T>(N);
+  T* source      = work_res.allocate<T>(N);
+  T* dest        = work_res.allocate<T>(N / 2);
+  T* check_array = host_res.allocate<T>(N / 2);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -38,10 +39,9 @@ void ForallAtomicViewTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  RAJA::forall<RAJA::seq_exec>(seg,
-                               [=](IdxType i) { hsource[i] = (T)1; });
+  RAJA::forall<RAJA::seq_exec>(seg, [=](IdxType i) { hsource[i] = (T)1; });
 
-  work_res.memcpy( source, hsource, sizeof(T) * N );
+  work_res.memcpy(source, hsource, sizeof(T) * N);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -59,16 +59,14 @@ void ForallAtomicViewTestImpl( IdxType N )
 
 
   // Zero out dest using atomic view
-  RAJA::forall<ExecPolicy>(seg_half, [=] RAJA_HOST_DEVICE(IdxType i) {
-    sum_atomic_view(i) = (T)0;
-  });
+  RAJA::forall<ExecPolicy>(seg_half, [=] RAJA_HOST_DEVICE(IdxType i)
+                           { sum_atomic_view(i) = (T)0; });
 
   // Assign values to dest using atomic view
-  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i) {
-    sum_atomic_view(i / 2) += vec_view(i);
-  });
+  RAJA::forall<ExecPolicy>(seg, [=] RAJA_HOST_DEVICE(IdxType i)
+                           { sum_atomic_view(i / 2) += vec_view(i); });
 
-  work_res.memcpy( check_array, dest, sizeof(T) * N/2 );
+  work_res.memcpy(check_array, dest, sizeof(T) * N / 2);
 
 #if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
@@ -78,21 +76,21 @@ void ForallAtomicViewTestImpl( IdxType N )
   hipErrchk(hipDeviceSynchronize());
 #endif
 
-  for (IdxType i = 0; i < N / 2; ++i) {
+  for (IdxType i = 0; i < N / 2; ++i)
+  {
     EXPECT_EQ((T)2, check_array[i]);
   }
 
-  host_res.deallocate( hsource );
-  work_res.deallocate( source );
-  work_res.deallocate( dest );
-  host_res.deallocate( check_array );
+  host_res.deallocate(hsource);
+  work_res.deallocate(source);
+  work_res.deallocate(dest);
+  host_res.deallocate(check_array);
 }
 
 TYPED_TEST_SUITE_P(ForallAtomicViewTest);
 template <typename T>
 class ForallAtomicViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallAtomicViewTest, AtomicViewForall)
 {
@@ -102,10 +100,9 @@ TYPED_TEST_P(ForallAtomicViewTest, AtomicViewForall)
   using IdxType = typename camp::at<TypeParam, camp::num<3>>::type;
   using DType   = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallAtomicViewTestImpl<AExec, APol, ResType, IdxType, DType>( 100000 );
+  ForallAtomicViewTestImpl<AExec, APol, ResType, IdxType, DType>(100000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallAtomicViewTest,
-                            AtomicViewForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicViewTest, AtomicViewForall);
 
 #endif  //__TEST_FORALL_ATOMIC_VIEW_HPP__
diff --git a/test/functional/forall/indexset-view/tests/test-forall-IcountIndexSetView.hpp b/test/functional/forall/indexset-view/tests/test-forall-IcountIndexSetView.hpp
index 26bd5ee7d9..ae4cbcfb09 100644
--- a/test/functional/forall/indexset-view/tests/test-forall-IcountIndexSetView.hpp
+++ b/test/functional/forall/indexset-view/tests/test-forall-IcountIndexSetView.hpp
@@ -23,62 +23,58 @@ void ForallIcountIndexSetViewTestImpl()
   using RangeStrideSegType = RAJA::TypedRangeStrideSegment<INDEX_TYPE>;
   using ListSegType        = RAJA::TypedListSegment<INDEX_TYPE>;
 
-  using IndexSetType = 
-   RAJA::TypedIndexSet< RangeSegType, RangeStrideSegType, ListSegType >; 
+  using IndexSetType =
+      RAJA::TypedIndexSet<RangeSegType, RangeStrideSegType, ListSegType>;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   IndexSetType iset;
-  std::vector<INDEX_TYPE> is_indices; 
+  std::vector<INDEX_TYPE> is_indices;
   buildIndexSet<INDEX_TYPE, RangeSegType, RangeStrideSegType, ListSegType>(
-    iset, is_indices, working_res);
+      iset, is_indices, working_res);
 
   //
   // Working array length
   //
-  const INDEX_TYPE N = is_indices[ is_indices.size() - 1 ] + 1;
+  const INDEX_TYPE N = is_indices[is_indices.size() - 1] + 1;
 
   //
   // Allocate and initialize arrays used in testing
-  //  
+  //
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );  
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
   INDEX_TYPE ticount = 0;
-  for (size_t i = 0; i < is_indices.size(); ++i) {
-    test_array[ ticount++ ] = is_indices[i];
+  for (size_t i = 0; i < is_indices.size(); ++i)
+  {
+    test_array[ticount++] = is_indices[i];
   }
 
   RAJA::Layout<1> layout(N);
-  RAJA::View< INDEX_TYPE, RAJA::Layout<1, INDEX_TYPE, 0> >
-    work_view(working_array, layout);
+  RAJA::View<INDEX_TYPE, RAJA::Layout<1, INDEX_TYPE, 0>> work_view(
+      working_array, layout);
 
-  RAJA::forall_Icount<EXEC_POLICY>(iset,
-    [=] RAJA_HOST_DEVICE(INDEX_TYPE icount, INDEX_TYPE idx) {
-    work_view( icount ) = idx;
-  });
+  RAJA::forall_Icount<EXEC_POLICY>(
+      iset, [=] RAJA_HOST_DEVICE(INDEX_TYPE icount, INDEX_TYPE idx)
+      { work_view(icount) = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  // 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  //
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -86,8 +82,7 @@ void ForallIcountIndexSetViewTestImpl()
 TYPED_TEST_SUITE_P(ForallIcountIndexSetViewTest);
 template <typename T>
 class ForallIcountIndexSetViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallIcountIndexSetViewTest, IndexSetForallIcountView)
 {
diff --git a/test/functional/forall/indexset-view/tests/test-forall-IndexSetView.hpp b/test/functional/forall/indexset-view/tests/test-forall-IndexSetView.hpp
index 7fc00c47d9..783bffa5fb 100644
--- a/test/functional/forall/indexset-view/tests/test-forall-IndexSetView.hpp
+++ b/test/functional/forall/indexset-view/tests/test-forall-IndexSetView.hpp
@@ -22,61 +22,57 @@ void ForallIndexSetViewTestImpl()
   using RangeStrideSegType = RAJA::TypedRangeStrideSegment<INDEX_TYPE>;
   using ListSegType        = RAJA::TypedListSegment<INDEX_TYPE>;
 
-  using IndexSetType = 
-   RAJA::TypedIndexSet< RangeSegType, RangeStrideSegType, ListSegType >; 
+  using IndexSetType =
+      RAJA::TypedIndexSet<RangeSegType, RangeStrideSegType, ListSegType>;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
-  IndexSetType iset; 
+  IndexSetType iset;
   std::vector<INDEX_TYPE> is_indices;
   buildIndexSet<INDEX_TYPE, RangeSegType, RangeStrideSegType, ListSegType>(
-    iset, is_indices, working_res);
+      iset, is_indices, working_res);
 
   //
   // Working array length
   //
-  const INDEX_TYPE N = is_indices[ is_indices.size() - 1 ] + 1;
+  const INDEX_TYPE N = is_indices[is_indices.size() - 1] + 1;
 
   //
   // Allocate and initialize arrays used in testing
-  //  
+  //
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );  
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
-  for (size_t i = 0; i < is_indices.size(); ++i) {
-    test_array[ is_indices[i] ] = is_indices[i];
+  for (size_t i = 0; i < is_indices.size(); ++i)
+  {
+    test_array[is_indices[i]] = is_indices[i];
   }
 
-  using view_type = RAJA::View< INDEX_TYPE, RAJA::Layout<1, INDEX_TYPE, 0> >;
+  using view_type = RAJA::View<INDEX_TYPE, RAJA::Layout<1, INDEX_TYPE, 0>>;
 
   RAJA::Layout<1> layout(N);
   view_type work_view(working_array, layout);
 
-  RAJA::forall<EXEC_POLICY>(iset, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    working_array[idx] = idx;
-  });
+  RAJA::forall<EXEC_POLICY>(iset, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            { working_array[idx] = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  // 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  //
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -84,8 +80,7 @@ void ForallIndexSetViewTestImpl()
 TYPED_TEST_SUITE_P(ForallIndexSetViewTest);
 template <typename T>
 class ForallIndexSetViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallIndexSetViewTest, IndexSetForallView)
 {
@@ -96,7 +91,6 @@ TYPED_TEST_P(ForallIndexSetViewTest, IndexSetForallView)
   ForallIndexSetViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetViewTest,
-                            IndexSetForallView);
+REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetViewTest, IndexSetForallView);
 
 #endif  // __TEST_FORALL_INDEXSET_VIEW_HPP__
diff --git a/test/functional/forall/indexset/tests/test-forall-IcountIndexSet.hpp b/test/functional/forall/indexset/tests/test-forall-IcountIndexSet.hpp
index 70fbb98b15..cd29d25073 100644
--- a/test/functional/forall/indexset/tests/test-forall-IcountIndexSet.hpp
+++ b/test/functional/forall/indexset/tests/test-forall-IcountIndexSet.hpp
@@ -21,57 +21,53 @@ void ForallIcountIndexSetTestImpl()
   using RangeStrideSegType = RAJA::TypedRangeStrideSegment<INDEX_TYPE>;
   using ListSegType        = RAJA::TypedListSegment<INDEX_TYPE>;
 
-  using IndexSetType = 
-   RAJA::TypedIndexSet< RangeSegType, RangeStrideSegType, ListSegType >; 
+  using IndexSetType =
+      RAJA::TypedIndexSet<RangeSegType, RangeStrideSegType, ListSegType>;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
-  IndexSetType iset; 
-  std::vector<INDEX_TYPE> is_indices; 
+  IndexSetType iset;
+  std::vector<INDEX_TYPE> is_indices;
   buildIndexSet<INDEX_TYPE, RangeSegType, RangeStrideSegType, ListSegType>(
-    iset, is_indices, working_res);
+      iset, is_indices, working_res);
 
   //
   // Working array length
   //
-  const INDEX_TYPE N = is_indices[ is_indices.size() - 1 ] + 1;
+  const INDEX_TYPE N = is_indices[is_indices.size() - 1] + 1;
 
   //
   // Allocate and initialize arrays used in testing
-  //  
+  //
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );  
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
   INDEX_TYPE ticount = 0;
-  for (size_t i = 0; i < is_indices.size(); ++i) {
-    test_array[ ticount++ ] = is_indices[i];
+  for (size_t i = 0; i < is_indices.size(); ++i)
+  {
+    test_array[ticount++] = is_indices[i];
   }
 
   RAJA::forall_Icount(EXEC_POLICY(), iset,
-    [=] RAJA_HOST_DEVICE(INDEX_TYPE icount, INDEX_TYPE idx) {
-    working_array[icount] = idx;
-  });
+                      [=] RAJA_HOST_DEVICE(INDEX_TYPE icount, INDEX_TYPE idx)
+                      { working_array[icount] = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -79,8 +75,7 @@ void ForallIcountIndexSetTestImpl()
 TYPED_TEST_SUITE_P(ForallIcountIndexSetTest);
 template <typename T>
 class ForallIcountIndexSetTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallIcountIndexSetTest, IndexSetForallIcount)
 {
@@ -91,7 +86,6 @@ TYPED_TEST_P(ForallIcountIndexSetTest, IndexSetForallIcount)
   ForallIcountIndexSetTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallIcountIndexSetTest,
-                            IndexSetForallIcount);
+REGISTER_TYPED_TEST_SUITE_P(ForallIcountIndexSetTest, IndexSetForallIcount);
 
 #endif  // __TEST_FORALL_ICOUNT_INDEXSET_HPP__
diff --git a/test/functional/forall/indexset/tests/test-forall-IndexSet.hpp b/test/functional/forall/indexset/tests/test-forall-IndexSet.hpp
index f2be845482..9decd9ae7e 100644
--- a/test/functional/forall/indexset/tests/test-forall-IndexSet.hpp
+++ b/test/functional/forall/indexset/tests/test-forall-IndexSet.hpp
@@ -20,56 +20,53 @@ void ForallIndexSetTestImpl()
   using RangeStrideSegType = RAJA::TypedRangeStrideSegment<INDEX_TYPE>;
   using ListSegType        = RAJA::TypedListSegment<INDEX_TYPE>;
 
-  using IndexSetType = 
-   RAJA::TypedIndexSet< RangeSegType, RangeStrideSegType, ListSegType >; 
+  using IndexSetType =
+      RAJA::TypedIndexSet<RangeSegType, RangeStrideSegType, ListSegType>;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
-  IndexSetType iset; 
-  std::vector<INDEX_TYPE> is_indices; 
+  IndexSetType iset;
+  std::vector<INDEX_TYPE> is_indices;
   buildIndexSet<INDEX_TYPE, RangeSegType, RangeStrideSegType, ListSegType>(
-    iset, is_indices, working_res);
+      iset, is_indices, working_res);
 
   //
   // Working array length
   //
-  const INDEX_TYPE N = is_indices[ is_indices.size() - 1 ] + 1;
+  const INDEX_TYPE N = is_indices[is_indices.size() - 1] + 1;
 
   //
   // Allocate and initialize arrays used in testing
-  //  
+  //
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );  
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
-  for (size_t i = 0; i < is_indices.size(); ++i) {
-    test_array[ is_indices[i] ] = is_indices[i];
+  for (size_t i = 0; i < is_indices.size(); ++i)
+  {
+    test_array[is_indices[i]] = is_indices[i];
   }
 
-  RAJA::forall(EXEC_POLICY(), iset, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    working_array[idx] = idx;
-  });
+  RAJA::forall(EXEC_POLICY(), iset,
+               [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+               { working_array[idx] = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  // 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  //
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -77,8 +74,7 @@ void ForallIndexSetTestImpl()
 TYPED_TEST_SUITE_P(ForallIndexSetTest);
 template <typename T>
 class ForallIndexSetTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallIndexSetTest, IndexSetForall)
 {
@@ -89,7 +85,6 @@ TYPED_TEST_P(ForallIndexSetTest, IndexSetForall)
   ForallIndexSetTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetTest,
-                            IndexSetForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetTest, IndexSetForall);
 
 #endif  // __TEST_FORALL_INDEXSET_HPP__
diff --git a/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp b/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp
index c783befdf4..b3c33c97f9 100644
--- a/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp
+++ b/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp
@@ -15,9 +15,13 @@
 #include <random>
 #include <type_traits>
 
-template <typename EXEC_POLICY, typename REDUCE_POLICY, typename ABSTRACTION,
-          typename DATA_TYPE, typename IDX_TYPE,
-          typename SEG_TYPE, typename Container,
+template <typename EXEC_POLICY,
+          typename REDUCE_POLICY,
+          typename ABSTRACTION,
+          typename DATA_TYPE,
+          typename IDX_TYPE,
+          typename SEG_TYPE,
+          typename Container,
           typename RandomGenerator>
 // use enable_if in return type to appease nvcc 11.2
 // add bool return type to disambiguate signatures of these functions for MSVC
@@ -27,11 +31,17 @@ ForallMultiReduceBasicTestImpl(const SEG_TYPE&,
                                const std::vector<IDX_TYPE>&,
                                camp::resources::Resource,
                                RandomGenerator&)
-{ return false; }
+{
+  return false;
+}
 ///
-template <typename EXEC_POLICY, typename REDUCE_POLICY, typename ABSTRACTION,
-          typename DATA_TYPE, typename IDX_TYPE,
-          typename SEG_TYPE, typename Container,
+template <typename EXEC_POLICY,
+          typename REDUCE_POLICY,
+          typename ABSTRACTION,
+          typename DATA_TYPE,
+          typename IDX_TYPE,
+          typename SEG_TYPE,
+          typename Container,
           typename RandomGenerator>
 // use enable_if in return type to appease nvcc 11.2
 std::enable_if_t<ABSTRACTION::template supports<DATA_TYPE>()>
@@ -41,12 +51,13 @@ ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg,
                                camp::resources::Resource working_res,
                                RandomGenerator& rngen)
 {
-  using MULTIREDUCER = typename ABSTRACTION::template multi_reducer<REDUCE_POLICY, DATA_TYPE>;
+  using MULTIREDUCER =
+      typename ABSTRACTION::template multi_reducer<REDUCE_POLICY, DATA_TYPE>;
 
   const IDX_TYPE idx_range = seg_idx[seg_idx.size() - 1] + 1;
-  const IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  const IDX_TYPE idx_len   = static_cast<IDX_TYPE>(seg_idx.size());
 
-  const int modval = 100;
+  const int modval      = 100;
   const size_t num_bins = multi_init.size();
 
   IDX_TYPE* working_range;
@@ -63,47 +74,44 @@ ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg,
 
   IDX_TYPE data_len = 0;
 
-  allocateForallTestData(idx_range+1,
-                         working_res,
-                         &working_range,
-                         &check_range,
-                         &test_range);
+  allocateForallTestData(idx_range + 1, working_res, &working_range,
+                         &check_range, &test_range);
 
-  for (IDX_TYPE i = 0; i < idx_range+1; ++i) {
+  for (IDX_TYPE i = 0; i < idx_range + 1; ++i)
+  {
     test_range[i] = ~IDX_TYPE(0);
   }
 
   {
-    std::uniform_int_distribution<IDX_TYPE> work_per_iterate_distribution(0, num_bins);
+    std::uniform_int_distribution<IDX_TYPE> work_per_iterate_distribution(
+        0, num_bins);
 
-    for (IDX_TYPE i = 0; i < idx_len; ++i) {
-      IDX_TYPE idx = seg_idx[i];
+    for (IDX_TYPE i = 0; i < idx_len; ++i)
+    {
+      IDX_TYPE idx    = seg_idx[i];
       test_range[idx] = data_len;
       data_len += work_per_iterate_distribution(rngen);
-      test_range[idx+1] = data_len;
+      test_range[idx + 1] = data_len;
     }
   }
 
-  allocateForallTestData(data_len,
-                         working_res,
-                         &working_array,
-                         &check_array,
+  allocateForallTestData(data_len, working_res, &working_array, &check_array,
                          &test_array);
 
-  allocateForallTestData(data_len,
-                         working_res,
-                         &working_bins,
-                         &check_bins,
+  allocateForallTestData(data_len, working_res, &working_bins, &check_bins,
                          &test_bins);
 
-  if (data_len > IDX_TYPE(0)) {
+  if (data_len > IDX_TYPE(0))
+  {
 
-    // use ints to initialize array here to avoid floating point precision issues
-    std::uniform_int_distribution<int> array_int_distribution(0, modval-1);
-    std::uniform_int_distribution<IDX_TYPE> bin_distribution(0, num_bins-1);
+    // use ints to initialize array here to avoid floating point precision
+    // issues
+    std::uniform_int_distribution<int> array_int_distribution(0, modval - 1);
+    std::uniform_int_distribution<IDX_TYPE> bin_distribution(0, num_bins - 1);
 
 
-    for (IDX_TYPE i = 0; i < data_len; ++i) {
+    for (IDX_TYPE i = 0; i < data_len; ++i)
+    {
       test_array[i] = DATA_TYPE(array_int_distribution(rngen));
 
       // this may use the same bin multiple times per iterate
@@ -111,7 +119,8 @@ ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg,
     }
   }
 
-  working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1));
+  working_res.memcpy(working_range, test_range,
+                     sizeof(IDX_TYPE) * (idx_range + 1));
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
   working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len);
 
@@ -123,19 +132,27 @@ ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg,
   {
     std::vector<DATA_TYPE> ref_vals(num_bins, ABSTRACTION::identity(red));
 
-    for (IDX_TYPE i = 0; i < data_len; ++i) {
-      ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
+    for (IDX_TYPE i = 0; i < data_len; ++i)
+    {
+      ref_vals[test_bins[i]] =
+          ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
     }
 
-    RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) {
-      for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-        ABSTRACTION::reduce(red[working_bins[idx]],  working_array[idx]);
-        ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]);
-      }
-    });
+    RAJA::forall<EXEC_POLICY>(
+        seg,
+        [=] RAJA_HOST_DEVICE(IDX_TYPE ii)
+        {
+          for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii + 1];
+               ++idx)
+          {
+            ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
+            ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]);
+          }
+        });
 
     size_t bin = 0;
-    for (auto init_val : multi_init) {
+    for (auto init_val : multi_init)
+    {
       ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]);
       ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val));
       ++bin;
@@ -145,44 +162,59 @@ ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg,
 
   red.reset();
 
-  // basic multiple use test, ensure same reducer can combine values from multiple loops
+  // basic multiple use test, ensure same reducer can combine values from
+  // multiple loops
   {
     std::vector<DATA_TYPE> ref_vals(num_bins, ABSTRACTION::identity(red));
 
     const int nloops = 2;
-    for (int j = 0; j < nloops; ++j) {
+    for (int j = 0; j < nloops; ++j)
+    {
 
-      for (IDX_TYPE i = 0; i < data_len; ++i) {
-        ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
+      for (IDX_TYPE i = 0; i < data_len; ++i)
+      {
+        ref_vals[test_bins[i]] =
+            ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
       }
 
-      RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) {
-        for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-          ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
-        }
-      });
+      RAJA::forall<EXEC_POLICY>(seg,
+                                [=] RAJA_HOST_DEVICE(IDX_TYPE ii)
+                                {
+                                  for (IDX_TYPE idx = working_range[ii];
+                                       idx < working_range[ii + 1]; ++idx)
+                                  {
+                                    ABSTRACTION::reduce(red[working_bins[idx]],
+                                                        working_array[idx]);
+                                  }
+                                });
     }
 
-    for (size_t bin = 0; bin < num_bins; ++bin) {
+    for (size_t bin = 0; bin < num_bins; ++bin)
+    {
       ASSERT_EQ(static_cast<DATA_TYPE>(red[bin].get()), ref_vals[bin]);
     }
   }
 
 
   // test the consistency of answers, if we expect them to be consistent
-  if (ABSTRACTION::consistent(red)) {
+  if (ABSTRACTION::consistent(red))
+  {
 
-    if /* constexpr */ (std::is_floating_point<DATA_TYPE>::value) {
+    if /* constexpr */ (std::is_floating_point<DATA_TYPE>::value)
+    {
 
       // use floating point values to accentuate floating point precision issues
       std::conditional_t<!std::is_floating_point<DATA_TYPE>::value,
-          std::uniform_int_distribution<DATA_TYPE>,
-          std::uniform_real_distribution<DATA_TYPE>> array_flt_distribution(0, modval-1);
+                         std::uniform_int_distribution<DATA_TYPE>,
+                         std::uniform_real_distribution<DATA_TYPE>>
+          array_flt_distribution(0, modval - 1);
 
-      for (IDX_TYPE i = 0; i < data_len; ++i) {
+      for (IDX_TYPE i = 0; i < data_len; ++i)
+      {
         test_array[i] = DATA_TYPE(array_flt_distribution(rngen));
       }
-      working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
+      working_res.memcpy(working_array, test_array,
+                         sizeof(DATA_TYPE) * data_len);
     }
 
 
@@ -190,21 +222,31 @@ ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg,
     bool got_ref_vals = false;
 
     const int nloops = 2;
-    for (int j = 0; j < nloops; ++j) {
+    for (int j = 0; j < nloops; ++j)
+    {
       red.reset();
 
-      RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) {
-        for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-          ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
-        }
-      });
-
-      if (!got_ref_vals) {
+      RAJA::forall<EXEC_POLICY>(seg,
+                                [=] RAJA_HOST_DEVICE(IDX_TYPE ii)
+                                {
+                                  for (IDX_TYPE idx = working_range[ii];
+                                       idx < working_range[ii + 1]; ++idx)
+                                  {
+                                    ABSTRACTION::reduce(red[working_bins[idx]],
+                                                        working_array[idx]);
+                                  }
+                                });
+
+      if (!got_ref_vals)
+      {
         ref_vals.resize(num_bins);
         red.get_all(ref_vals);
         got_ref_vals = true;
-      } else {
-        for (size_t bin = 0; bin < num_bins; ++bin) {
+      }
+      else
+      {
+        for (size_t bin = 0; bin < num_bins; ++bin)
+        {
           ASSERT_EQ(red.get(bin), ref_vals[bin]);
         }
       }
@@ -212,26 +254,16 @@ ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg,
   }
 
 
-  deallocateForallTestData(working_res,
-                           working_bins,
-                           check_bins,
-                           test_bins);
-  deallocateForallTestData(working_res,
-                           working_array,
-                           check_array,
-                           test_array);
-  deallocateForallTestData(working_res,
-                           working_range,
-                           check_range,
-                           test_range);
+  deallocateForallTestData(working_res, working_bins, check_bins, test_bins);
+  deallocateForallTestData(working_res, working_array, check_array, test_array);
+  deallocateForallTestData(working_res, working_range, check_range, test_range);
 }
 
 
 TYPED_TEST_SUITE_P(ForallMultiReduceBasicTest);
 template <typename T>
 class ForallMultiReduceBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallMultiReduceBasicTest, MultiReduceBasicForall)
 {
@@ -243,10 +275,10 @@ TYPED_TEST_P(ForallMultiReduceBasicTest, MultiReduceBasicForall)
   using ABSTRACTION   = typename camp::at<TypeParam, camp::num<5>>::type;
 
   // for setting random values in arrays
-  auto random_seed = std::random_device{}();
+  auto random_seed = std::random_device {}();
   std::mt19937 rngen(random_seed);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
@@ -254,51 +286,58 @@ TYPED_TEST_P(ForallMultiReduceBasicTest, MultiReduceBasicForall)
 
   std::vector<size_t> num_bins_max_container({0, 1, 100});
   size_t num_bins_min = 0;
-  for (size_t num_bins_max : num_bins_max_container) {
+  for (size_t num_bins_max : num_bins_max_container)
+  {
 
-    std::uniform_int_distribution<size_t> num_bins_dist(num_bins_min, num_bins_max);
-    num_bins_min = num_bins_max+1;
+    std::uniform_int_distribution<size_t> num_bins_dist(num_bins_min,
+                                                        num_bins_max);
+    num_bins_min    = num_bins_max + 1;
     size_t num_bins = num_bins_dist(rngen);
 
     container.resize(num_bins, DATA_TYPE(2));
 
     // Range segment tests
-    RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+    RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
     RAJA::getIndices(seg_idx, r1);
-    ForallMultiReduceBasicTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION, DATA_TYPE>(
-                                   r1, container, seg_idx, working_res, rngen);
+    ForallMultiReduceBasicTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION,
+                                   DATA_TYPE>(r1, container, seg_idx,
+                                              working_res, rngen);
 
     seg_idx.clear();
-    RAJA::TypedRangeSegment<IDX_TYPE> r3( 3, 2060 );
+    RAJA::TypedRangeSegment<IDX_TYPE> r3(3, 2060);
     RAJA::getIndices(seg_idx, r3);
-    ForallMultiReduceBasicTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION, DATA_TYPE>(
-                                   r3, container, seg_idx, working_res, rngen);
+    ForallMultiReduceBasicTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION,
+                                   DATA_TYPE>(r3, container, seg_idx,
+                                              working_res, rngen);
 
     // Range-stride segment test
     seg_idx.clear();
-    RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+    RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
     RAJA::getIndices(seg_idx, r5);
-    ForallMultiReduceBasicTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION, DATA_TYPE>(
-                                   r5, container, seg_idx, working_res, rngen);
+    ForallMultiReduceBasicTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION,
+                                   DATA_TYPE>(r5, container, seg_idx,
+                                              working_res, rngen);
 
     // List segment test
     seg_idx.clear();
     IDX_TYPE last = 10567;
-    std::uniform_int_distribution<IDX_TYPE> dist(0, last-1);
-    for (IDX_TYPE i = 0; i < last; ++i) {
+    std::uniform_int_distribution<IDX_TYPE> dist(0, last - 1);
+    for (IDX_TYPE i = 0; i < last; ++i)
+    {
       IDX_TYPE randval = dist(rngen);
-      if ( i < randval ) {
+      if (i < randval)
+      {
         seg_idx.push_back(i);
       }
     }
-    RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                         working_res );
-    ForallMultiReduceBasicTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION, DATA_TYPE>(
-                                   l1, container, seg_idx, working_res, rngen);
+    RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(),
+                                        working_res);
+    ForallMultiReduceBasicTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION,
+                                   DATA_TYPE>(l1, container, seg_idx,
+                                              working_res, rngen);
   }
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallMultiReduceBasicTest,
-                            MultiReduceBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallMultiReduceBasicTest, MultiReduceBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCESUM_HPP__
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceBitAnd.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceBitAnd.hpp
index 6adade08a9..c4ef3f3188 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceBitAnd.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceBitAnd.hpp
@@ -13,66 +13,69 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
                                      const std::vector<IDX_TYPE>& seg_idx,
                                      camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   //
   // First a simple non-trivial test that is mildly interesting
   //
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
     test_array[i] = 13;
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   RAJA::ReduceBitAnd<REDUCE_POLICY, DATA_TYPE> simpand(21);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    simpand &= working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { simpand &= working_array[idx]; });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(simpand.get()), 5);
 
-  
-  // 
+
+  //
   // And now a randomized test that pushes zeros around
-  // 
+  //
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE ref_and = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_and &= test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_and &= test_array[seg_idx[i]];
   }
 
   RAJA::ReduceBitAnd<REDUCE_POLICY, DATA_TYPE> redand(0);
   RAJA::ReduceBitAnd<REDUCE_POLICY, DATA_TYPE> redand2(2);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    redand  &= working_array[idx];
-    redand2 &= working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            {
+                              redand &= working_array[idx];
+                              redand2 &= working_array[idx];
+                            });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redand.get()), ref_and);
   ASSERT_EQ(static_cast<DATA_TYPE>(redand2.get()), ref_and);
@@ -80,18 +83,16 @@ void ForallReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
   redand.reset(0);
 
   const int nloops = 3;
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-      redand &= working_array[idx];
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                              { redand &= working_array[idx]; });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redand.get()), ref_and);
-   
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -99,8 +100,7 @@ void ForallReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(ForallReduceBitAndBasicTest);
 template <typename T>
 class ForallReduceBitAndBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceBitAndBasicTest, ReduceBitAndBasicForall)
 {
@@ -110,67 +110,67 @@ TYPED_TEST_P(ForallReduceBitAndBasicTest, ReduceBitAndBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r1, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r2, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r3, seg_idx,
+                                                              working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                              working_res);
 
   // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+                                  RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                                  REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceBitAndBasicTest,
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceBitOr.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceBitOr.hpp
index a0db78c4f6..5e783b89e0 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceBitOr.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceBitOr.hpp
@@ -13,67 +13,70 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceBitOrBasicTestImpl(const SEG_TYPE& seg,
-                                     const std::vector<IDX_TYPE>& seg_idx,
-                                     camp::resources::Resource working_res)
+                                    const std::vector<IDX_TYPE>& seg_idx,
+                                    camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   //
   // First a simple non-trivial test that is mildly interesting
   //
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
     test_array[i] = 9;
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   RAJA::ReduceBitOr<REDUCE_POLICY, DATA_TYPE> simpor(5);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    simpor |= working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { simpor |= working_array[idx]; });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(simpor.get()), 13);
 
- 
+
   //
   // And now a randomized test that pushes zeros around
   //
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE ref_or = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_or |= test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_or |= test_array[seg_idx[i]];
   }
 
 
   RAJA::ReduceBitOr<REDUCE_POLICY, DATA_TYPE> redor(0);
   RAJA::ReduceBitOr<REDUCE_POLICY, DATA_TYPE> redor2(2);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    redor  |= working_array[idx];
-    redor2 |= working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            {
+                              redor |= working_array[idx];
+                              redor2 |= working_array[idx];
+                            });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redor.get()), ref_or);
   ASSERT_EQ(static_cast<DATA_TYPE>(redor2.get()), ref_or);
@@ -81,26 +84,23 @@ void ForallReduceBitOrBasicTestImpl(const SEG_TYPE& seg,
   redor.reset(0);
 
   const int nloops = 3;
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-      redor |= working_array[idx];
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                              { redor |= working_array[idx]; });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redor.get()), ref_or);
-   
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceBitOrBasicTest);
 template <typename T>
 class ForallReduceBitOrBasicTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(ForallReduceBitOrBasicTest, ReduceBitOrBasicForall)
@@ -111,70 +111,66 @@ TYPED_TEST_P(ForallReduceBitOrBasicTest, ReduceBitOrBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-    camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                 RAJA::TypedRangeSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r1, seg_idx, working_res);
+                                 RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                                 REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                 RAJA::TypedRangeSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r2, seg_idx, working_res);
+                                 RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                                 REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                 RAJA::TypedRangeSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r3, seg_idx, working_res);
+                                 RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                                 REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r4, seg_idx, working_res);
+                                 EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                             working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r5, seg_idx, working_res);
+                                 EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                             working_res);
 
   // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                 RAJA::TypedListSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   l1, seg_idx, working_res);
+                                 RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                                 REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallReduceBitOrBasicTest,
-                            ReduceBitOrBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallReduceBitOrBasicTest, ReduceBitOrBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCEBITOR_HPP__
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMax.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMax.hpp
index 5ec8c47164..cb5657cde4 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMax.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMax.hpp
@@ -13,37 +13,38 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMaxBasicTestImpl(const SEG_TYPE& seg,
                                   const std::vector<IDX_TYPE>& seg_idx,
                                   camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
- 
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
+
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
+  const int modval         = 100;
   const DATA_TYPE max_init = -1;
-  const DATA_TYPE big_max = modval + 1;
+  const DATA_TYPE big_max  = modval + 1;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_max = max_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_max = RAJA_MAX(test_array[ seg_idx[i] ], ref_max); 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_max = RAJA_MAX(test_array[seg_idx[i]], ref_max);
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -51,10 +52,12 @@ void ForallReduceMaxBasicTestImpl(const SEG_TYPE& seg,
   RAJA::ReduceMax<REDUCE_POLICY, DATA_TYPE> maxinit(big_max);
   RAJA::ReduceMax<REDUCE_POLICY, DATA_TYPE> max(max_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    maxinit.max( working_array[idx] );
-    max.max( working_array[idx] );
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            {
+                              maxinit.max(working_array[idx]);
+                              max.max(working_array[idx]);
+                            });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(maxinit.get()), big_max);
   ASSERT_EQ(static_cast<DATA_TYPE>(max.get()), ref_max);
@@ -63,29 +66,24 @@ void ForallReduceMaxBasicTestImpl(const SEG_TYPE& seg,
   ASSERT_EQ(static_cast<DATA_TYPE>(max.get()), max_init);
 
   DATA_TYPE factor = 2;
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    max.max( working_array[idx] * factor);
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { max.max(working_array[idx] * factor); });
   ASSERT_EQ(static_cast<DATA_TYPE>(max.get()), ref_max * factor);
-   
+
   factor = 3;
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    max.max( working_array[idx] * factor);
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { max.max(working_array[idx] * factor); });
   ASSERT_EQ(static_cast<DATA_TYPE>(max.get()), ref_max * factor);
-   
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMaxBasicTest);
 template <typename T>
 class ForallReduceMaxBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMaxBasicTest, ReduceMaxBasicForall)
 {
@@ -95,70 +93,66 @@ TYPED_TEST_P(ForallReduceMaxBasicTest, ReduceMaxBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r1, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r2, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r3, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r4, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                           working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r5, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                           working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 l1, seg_idx, working_res);
+                               RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallReduceMaxBasicTest,
-                            ReduceMaxBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallReduceMaxBasicTest, ReduceMaxBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCEMAX_HPP__
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMaxLoc.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMaxLoc.hpp
index c5f228821d..3aaba8daf4 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMaxLoc.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMaxLoc.hpp
@@ -13,57 +13,63 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMaxLocBasicTestImpl(const SEG_TYPE& seg,
                                      const std::vector<IDX_TYPE>& seg_idx,
                                      camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
-  const DATA_TYPE max_init = -modval;
+  const int modval           = 100;
+  const DATA_TYPE max_init   = -modval;
   const IDX_TYPE maxloc_init = -1;
-  const IDX_TYPE maxloc_idx = seg_idx[ idx_len * 2/3 ];
-  const DATA_TYPE big_max = modval+1;
-  const IDX_TYPE big_maxloc = maxloc_init;
+  const IDX_TYPE maxloc_idx  = seg_idx[idx_len * 2 / 3];
+  const DATA_TYPE big_max    = modval + 1;
+  const IDX_TYPE big_maxloc  = maxloc_init;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   test_array[maxloc_idx] = static_cast<DATA_TYPE>(big_max);
 
-  DATA_TYPE ref_max = max_init;
+  DATA_TYPE ref_max   = max_init;
   IDX_TYPE ref_maxloc = maxloc_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    if ( test_array[ seg_idx[i] ] > ref_max ) {
-       ref_max = test_array[ seg_idx[i] ];
-       ref_maxloc = seg_idx[i];
-    } 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    if (test_array[seg_idx[i]] > ref_max)
+    {
+      ref_max    = test_array[seg_idx[i]];
+      ref_maxloc = seg_idx[i];
+    }
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
 
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> maxinit(big_max, maxloc_init);
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> max(max_init, maxloc_init);
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> maxinit(big_max,
+                                                                 maxloc_init);
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> max(max_init,
+                                                             maxloc_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    maxinit.maxloc( working_array[idx], idx );
-    max.maxloc( working_array[idx], idx );
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            {
+                              maxinit.maxloc(working_array[idx], idx);
+                              max.maxloc(working_array[idx], idx);
+                            });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(maxinit.get()), big_max);
   ASSERT_EQ(static_cast<IDX_TYPE>(maxinit.getLoc()), big_maxloc);
@@ -75,31 +81,26 @@ void ForallReduceMaxLocBasicTestImpl(const SEG_TYPE& seg,
   ASSERT_EQ(static_cast<IDX_TYPE>(max.getLoc()), maxloc_init);
 
   DATA_TYPE factor = 2;
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    max.maxloc( working_array[idx] * factor, idx);
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { max.maxloc(working_array[idx] * factor, idx); });
   ASSERT_EQ(static_cast<DATA_TYPE>(max.get()), ref_max * factor);
   ASSERT_EQ(static_cast<IDX_TYPE>(max.getLoc()), ref_maxloc);
-  
+
   factor = 3;
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) { 
-    max.maxloc( working_array[idx] * factor, idx);
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { max.maxloc(working_array[idx] * factor, idx); });
   ASSERT_EQ(static_cast<DATA_TYPE>(max.get()), ref_max * factor);
   ASSERT_EQ(static_cast<IDX_TYPE>(max.getLoc()), ref_maxloc);
- 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMaxLocBasicTest);
 template <typename T>
 class ForallReduceMaxLocBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMaxLocBasicTest, ReduceMaxLocBasicForall)
 {
@@ -109,67 +110,67 @@ TYPED_TEST_P(ForallReduceMaxLocBasicTest, ReduceMaxLocBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r1, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r2, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r3, seg_idx,
+                                                              working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                              working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+                                  RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                                  REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceMaxLocBasicTest,
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMin.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMin.hpp
index 67e051acc4..2d91806ad6 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMin.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMin.hpp
@@ -13,37 +13,38 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
                                   const std::vector<IDX_TYPE>& seg_idx,
                                   camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
-  const DATA_TYPE min_init = modval+1;
+  const int modval          = 100;
+  const DATA_TYPE min_init  = modval + 1;
   const DATA_TYPE small_min = -modval;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_min = min_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_min = RAJA_MIN(test_array[ seg_idx[i] ], ref_min); 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_min = RAJA_MIN(test_array[seg_idx[i]], ref_min);
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -52,10 +53,12 @@ void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
   RAJA::ReduceMin<REDUCE_POLICY, DATA_TYPE> mininit(small_min);
   RAJA::ReduceMin<REDUCE_POLICY, DATA_TYPE> min(min_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    mininit.min( working_array[idx] );
-    min.min( working_array[idx] );
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            {
+                              mininit.min(working_array[idx]);
+                              min.min(working_array[idx]);
+                            });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(mininit.get()), small_min);
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), ref_min);
@@ -63,24 +66,20 @@ void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
   min.reset(min_init);
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), min_init);
 
-  DATA_TYPE factor = 3; 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    min.min( working_array[idx] * factor);
-  });
+  DATA_TYPE factor = 3;
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { min.min(working_array[idx] * factor); });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), ref_min * factor);
 
   factor = 2;
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) { 
-    min.min( working_array[idx] * factor);
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { min.min(working_array[idx] * factor); });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), ref_min * factor);
-   
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -88,8 +87,7 @@ void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(ForallReduceMinBasicTest);
 template <typename T>
 class ForallReduceMinBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMinBasicTest, ReduceMinBasicForall)
 {
@@ -99,70 +97,66 @@ TYPED_TEST_P(ForallReduceMinBasicTest, ReduceMinBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r1, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r2, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r3, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r4, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                           working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r5, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                           working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 l1, seg_idx, working_res);
+                               RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallReduceMinBasicTest,
-                            ReduceMinBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallReduceMinBasicTest, ReduceMinBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCEMIN_HPP__
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMinLoc.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMinLoc.hpp
index be5265d4b1..58e679cfe5 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMinLoc.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceMinLoc.hpp
@@ -13,57 +13,63 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMinLocBasicTestImpl(const SEG_TYPE& seg,
                                      const std::vector<IDX_TYPE>& seg_idx,
                                      camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
-
-  const int modval = 100;
-  const DATA_TYPE min_init = modval+1;
-  const IDX_TYPE minloc_init = -1;
-  const IDX_TYPE minloc_idx = seg_idx[ idx_len * 2/3 ];
-  const DATA_TYPE small_min = -modval;
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
+
+  const int modval            = 100;
+  const DATA_TYPE min_init    = modval + 1;
+  const IDX_TYPE minloc_init  = -1;
+  const IDX_TYPE minloc_idx   = seg_idx[idx_len * 2 / 3];
+  const DATA_TYPE small_min   = -modval;
   const IDX_TYPE small_minloc = minloc_init;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   test_array[minloc_idx] = static_cast<DATA_TYPE>(small_min);
 
-  DATA_TYPE ref_min = min_init;
+  DATA_TYPE ref_min   = min_init;
   IDX_TYPE ref_minloc = minloc_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    if ( test_array[ seg_idx[i] ] < ref_min ) {
-       ref_min = test_array[ seg_idx[i] ];
-       ref_minloc = seg_idx[i];
-    } 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    if (test_array[seg_idx[i]] < ref_min)
+    {
+      ref_min    = test_array[seg_idx[i]];
+      ref_minloc = seg_idx[i];
+    }
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
 
-  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> mininit(small_min, minloc_init);
-  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> min(min_init, minloc_init);
+  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> mininit(small_min,
+                                                                 minloc_init);
+  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> min(min_init,
+                                                             minloc_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    mininit.minloc( working_array[idx], idx );
-    min.minloc( working_array[idx], idx );
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            {
+                              mininit.minloc(working_array[idx], idx);
+                              min.minloc(working_array[idx], idx);
+                            });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(mininit.get()), small_min);
   ASSERT_EQ(static_cast<IDX_TYPE>(mininit.getLoc()), small_minloc);
@@ -75,31 +81,26 @@ void ForallReduceMinLocBasicTestImpl(const SEG_TYPE& seg,
   ASSERT_EQ(static_cast<IDX_TYPE>(min.getLoc()), minloc_init);
 
   DATA_TYPE factor = 2;
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    min.minloc( working_array[idx] * factor, idx);
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { min.minloc(working_array[idx] * factor, idx); });
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), ref_min * factor);
   ASSERT_EQ(static_cast<IDX_TYPE>(min.getLoc()), ref_minloc);
 
   factor = 3;
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) { 
-    min.minloc( working_array[idx] * factor, idx);
-  });
+  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            { min.minloc(working_array[idx] * factor, idx); });
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), ref_min * factor);
   ASSERT_EQ(static_cast<IDX_TYPE>(min.getLoc()), ref_minloc);
-   
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMinLocBasicTest);
 template <typename T>
 class ForallReduceMinLocBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMinLocBasicTest, ReduceMinLocBasicForall)
 {
@@ -109,67 +110,67 @@ TYPED_TEST_P(ForallReduceMinLocBasicTest, ReduceMinLocBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r1, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r2, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r3, seg_idx,
+                                                              working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                              working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+                                  RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                                  REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceMinLocBasicTest,
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceSum.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceSum.hpp
index 2203aedd1b..11112841b0 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceSum.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-ReduceSum.hpp
@@ -13,35 +13,36 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
-void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg, 
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg,
                                   const std::vector<IDX_TYPE>& seg_idx,
                                   camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_sum = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_sum += test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_sum += test_array[seg_idx[i]];
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -50,10 +51,12 @@ void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg,
   RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> sum(0);
   RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> sum2(2);
 
-  RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-    sum  += working_array[idx];
-    sum2 += working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                            {
+                              sum += working_array[idx];
+                              sum2 += working_array[idx];
+                            });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(sum.get()), ref_sum);
   ASSERT_EQ(static_cast<DATA_TYPE>(sum2.get()), ref_sum + 2);
@@ -62,18 +65,16 @@ void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg,
 
   const int nloops = 2;
 
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-      sum += working_array[idx];
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::forall<EXEC_POLICY>(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                              { sum += working_array[idx]; });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(sum.get()), nloops * ref_sum);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -81,8 +82,7 @@ void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(ForallReduceSumBasicTest);
 template <typename T>
 class ForallReduceSumBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceSumBasicTest, ReduceSumBasicForall)
 {
@@ -92,70 +92,66 @@ TYPED_TEST_P(ForallReduceSumBasicTest, ReduceSumBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r1, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r2, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r3, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r4, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                           working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r5, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                           working_res);
 
-// List segment tests
-  seg_idx.clear(); 
+  // List segment tests
+  seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(), 
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 l1, seg_idx, working_res);
+                               RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallReduceSumBasicTest,
-                            ReduceSumBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallReduceSumBasicTest, ReduceSumBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCESUM_HPP__
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceBitAnd.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceBitAnd.hpp
index c64106fc59..cad50d5843 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceBitAnd.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceBitAnd.hpp
@@ -13,9 +13,11 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
                                      const std::vector<IDX_TYPE>& seg_idx,
                                      camp::resources::Resource working_res)
@@ -23,33 +25,30 @@ void ForallReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
   using REF_BITAND = RAJA::expt::ValOp<DATA_TYPE, RAJA::operators::bit_and>;
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   //
   // First a simple non-trivial test that is mildly interesting
   //
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
     test_array[i] = 13;
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE simpand(21);
 
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::Reduce<RAJA::operators::bit_and>(&simpand),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITAND & _simpand) {
-      _simpand &= working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::bit_and>(&simpand),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITAND & _simpand)
+      { _simpand &= working_array[idx]; });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(simpand), 5);
 
@@ -60,27 +59,30 @@ void ForallReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE ref_and = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_and &= test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_and &= test_array[seg_idx[i]];
   }
 
   DATA_TYPE redand(0);
   DATA_TYPE redand2(2);
 
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand),
-    RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand2),
-    RAJA::expt::KernelName("RAJA Reduce BitAnd"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITAND &r1, REF_BITAND &r2) {
-      r1 &= working_array[idx];
-      r2 &= working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand),
+      RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand2),
+      RAJA::expt::KernelName("RAJA Reduce BitAnd"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITAND & r1, REF_BITAND & r2)
+      {
+        r1 &= working_array[idx];
+        r2 &= working_array[idx];
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redand), ref_and);
   ASSERT_EQ(static_cast<DATA_TYPE>(redand2), ref_and);
@@ -88,20 +90,18 @@ void ForallReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
   redand = 0;
 
   const int nloops = 3;
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::forall<EXEC_POLICY>(seg,
-      RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand),
-      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITAND &r1) {
-        r1 &= working_array[idx];
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::forall<EXEC_POLICY>(
+        seg, RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand),
+        [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITAND & r1)
+        { r1 &= working_array[idx]; });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redand), ref_and);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -109,8 +109,7 @@ void ForallReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(ForallReduceBitAndBasicTest);
 template <typename T>
 class ForallReduceBitAndBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceBitAndBasicTest, ReduceBitAndBasicForall)
 {
@@ -120,67 +119,67 @@ TYPED_TEST_P(ForallReduceBitAndBasicTest, ReduceBitAndBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r1, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r2, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r3, seg_idx,
+                                                              working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                              working_res);
 
   // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+                                  RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                                  REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceBitAndBasicTest,
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceBitOr.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceBitOr.hpp
index 8c3ea14c4d..65a0deccd5 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceBitOr.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceBitOr.hpp
@@ -13,74 +13,76 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceBitOrBasicTestImpl(const SEG_TYPE& seg,
-                                     const std::vector<IDX_TYPE>& seg_idx,
-                                     camp::resources::Resource working_res)
+                                    const std::vector<IDX_TYPE>& seg_idx,
+                                    camp::resources::Resource working_res)
 {
   using REF_BITOR = RAJA::expt::ValOp<DATA_TYPE, RAJA::operators::bit_or>;
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   //
   // First a simple non-trivial test that is mildly interesting
   //
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
     test_array[i] = 9;
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE simpor(5);
 
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::Reduce<RAJA::operators::bit_or>(&simpor),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITOR & _simpor) {
-      _simpor |= working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::bit_or>(&simpor),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITOR & _simpor)
+      { _simpor |= working_array[idx]; });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(simpor), 13);
 
- 
+
   //
   // And now a randomized test that pushes zeros around
   //
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE ref_or = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_or |= test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_or |= test_array[seg_idx[i]];
   }
 
   DATA_TYPE redor(0);
   DATA_TYPE redor2(2);
 
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::Reduce<RAJA::operators::bit_or>(&redor),
-    RAJA::expt::Reduce<RAJA::operators::bit_or>(&redor2),
-    RAJA::expt::KernelName("RAJA Reduce BitOr"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITOR &r1, REF_BITOR &r2) {
-      r1 |= working_array[idx];
-      r2 |= working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::bit_or>(&redor),
+      RAJA::expt::Reduce<RAJA::operators::bit_or>(&redor2),
+      RAJA::expt::KernelName("RAJA Reduce BitOr"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITOR & r1, REF_BITOR & r2)
+      {
+        r1 |= working_array[idx];
+        r2 |= working_array[idx];
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redor), ref_or);
   ASSERT_EQ(static_cast<DATA_TYPE>(redor2), ref_or);
@@ -88,28 +90,25 @@ void ForallReduceBitOrBasicTestImpl(const SEG_TYPE& seg,
   redor = 0;
 
   const int nloops = 3;
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::forall<EXEC_POLICY>(seg,
-      RAJA::expt::Reduce<RAJA::operators::bit_or>(&redor),
-      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITOR &r1) {
-        r1 |= working_array[idx];
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::forall<EXEC_POLICY>(
+        seg, RAJA::expt::Reduce<RAJA::operators::bit_or>(&redor),
+        [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_BITOR & r1)
+        { r1 |= working_array[idx]; });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redor), ref_or);
-   
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceBitOrBasicTest);
 template <typename T>
 class ForallReduceBitOrBasicTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(ForallReduceBitOrBasicTest, ReduceBitOrBasicForall)
@@ -120,70 +119,66 @@ TYPED_TEST_P(ForallReduceBitOrBasicTest, ReduceBitOrBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-    camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                 RAJA::TypedRangeSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r1, seg_idx, working_res);
+                                 RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                                 REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                 RAJA::TypedRangeSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r2, seg_idx, working_res);
+                                 RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                                 REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                 RAJA::TypedRangeSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r3, seg_idx, working_res);
+                                 RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                                 REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r4, seg_idx, working_res);
+                                 EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                             working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   r5, seg_idx, working_res);
+                                 EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                             working_res);
 
   // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceBitOrBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                 RAJA::TypedListSegment<IDX_TYPE>,
-                                 EXEC_POLICY, REDUCE_POLICY>(
-                                   l1, seg_idx, working_res);
+                                 RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                                 REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallReduceBitOrBasicTest,
-                            ReduceBitOrBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallReduceBitOrBasicTest, ReduceBitOrBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCEBITOR_HPP__
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMax.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMax.hpp
index 773c737a85..4b8d23cb2e 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMax.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMax.hpp
@@ -13,9 +13,11 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMaxBasicTestImpl(const SEG_TYPE& seg,
                                   const std::vector<IDX_TYPE>& seg_idx,
                                   camp::resources::Resource working_res)
@@ -23,29 +25,28 @@ void ForallReduceMaxBasicTestImpl(const SEG_TYPE& seg,
   using REF_MAX = RAJA::expt::ValOp<DATA_TYPE, RAJA::operators::maximum>;
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
- 
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
+
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
+  const int modval         = 100;
   const DATA_TYPE max_init = -1;
-  const DATA_TYPE big_max = modval + 1;
+  const DATA_TYPE big_max  = modval + 1;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_max = max_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_max = RAJA_MAX(test_array[ seg_idx[i] ], ref_max); 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_max = RAJA_MAX(test_array[seg_idx[i]], ref_max);
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -53,14 +54,15 @@ void ForallReduceMaxBasicTestImpl(const SEG_TYPE& seg,
   DATA_TYPE maxinit(big_max);
   DATA_TYPE max(max_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&maxinit),
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
-    RAJA::expt::KernelName("RAJA Reduce Max"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MAX &mi, REF_MAX &m) {
-      mi.max(working_array[idx]);
-      m.max(working_array[idx]);
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::maximum>(&maxinit),
+      RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
+      RAJA::expt::KernelName("RAJA Reduce Max"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MAX & mi, REF_MAX & m)
+      {
+        mi.max(working_array[idx]);
+        m.max(working_array[idx]);
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(maxinit), big_max);
   ASSERT_EQ(static_cast<DATA_TYPE>(max), ref_max);
@@ -69,33 +71,28 @@ void ForallReduceMaxBasicTestImpl(const SEG_TYPE& seg,
   ASSERT_EQ(static_cast<DATA_TYPE>(max), max_init);
 
   DATA_TYPE factor = 2;
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MAX &m) {
-      m.max(working_array[idx] * factor);
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MAX & m)
+                            { m.max(working_array[idx] * factor); });
   ASSERT_EQ(static_cast<DATA_TYPE>(max), ref_max * factor);
-   
+
   factor = 3;
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MAX &m) {
-      m.max(working_array[idx] * factor);
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MAX & m)
+                            { m.max(working_array[idx] * factor); });
   ASSERT_EQ(static_cast<DATA_TYPE>(max), ref_max * factor);
-   
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMaxBasicTest);
 template <typename T>
 class ForallReduceMaxBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMaxBasicTest, ReduceMaxBasicForall)
 {
@@ -105,70 +102,66 @@ TYPED_TEST_P(ForallReduceMaxBasicTest, ReduceMaxBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r1, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r2, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r3, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r4, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                           working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r5, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                           working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMaxBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 l1, seg_idx, working_res);
+                               RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallReduceMaxBasicTest,
-                            ReduceMaxBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallReduceMaxBasicTest, ReduceMaxBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCEMAX_HPP__
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMaxLoc.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMaxLoc.hpp
index 225018eac8..107e29b925 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMaxLoc.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMaxLoc.hpp
@@ -13,63 +13,68 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMaxLocBasicTestImpl(const SEG_TYPE& seg,
                                      const std::vector<IDX_TYPE>& seg_idx,
                                      camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
-  const DATA_TYPE max_init = -modval;
+  const int modval           = 100;
+  const DATA_TYPE max_init   = -modval;
   const IDX_TYPE maxloc_init = -1;
-  const IDX_TYPE maxloc_idx = seg_idx[ idx_len * 2/3 ];
-  const DATA_TYPE big_max = modval*10;
-  const IDX_TYPE big_maxloc = maxloc_init;
+  const IDX_TYPE maxloc_idx  = seg_idx[idx_len * 2 / 3];
+  const DATA_TYPE big_max    = modval * 10;
+  const IDX_TYPE big_maxloc  = maxloc_init;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( 1000 % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(1000 % modval);
   }
   test_array[maxloc_idx] = static_cast<DATA_TYPE>(big_max);
 
-  DATA_TYPE ref_max = max_init;
+  DATA_TYPE ref_max   = max_init;
   IDX_TYPE ref_maxloc = maxloc_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    if ( test_array[ seg_idx[i] ] > ref_max ) {
-       ref_max = test_array[ seg_idx[i] ];
-       ref_maxloc = seg_idx[i];
-    } 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    if (test_array[seg_idx[i]] > ref_max)
+    {
+      ref_max    = test_array[seg_idx[i]];
+      ref_maxloc = seg_idx[i];
+    }
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
 
   using VL_TYPE = RAJA::expt::ValLoc<DATA_TYPE, IDX_TYPE>;
-  using VL_LAMBDA_TYPE = RAJA::expt::ValLocOp<DATA_TYPE, IDX_TYPE, RAJA::operators::maximum>;
+  using VL_LAMBDA_TYPE =
+      RAJA::expt::ValLocOp<DATA_TYPE, IDX_TYPE, RAJA::operators::maximum>;
   VL_TYPE maxinit(big_max, maxloc_init);
   VL_TYPE max(max_init, maxloc_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&maxinit),
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
-    RAJA::expt::KernelName("RAJA Reduce MaxLoc"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE &mi, VL_LAMBDA_TYPE &m) {
-      mi.maxloc( working_array[idx], idx );
-      m.maxloc( working_array[idx], idx );
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::maximum>(&maxinit),
+      RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
+      RAJA::expt::KernelName("RAJA Reduce MaxLoc"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE & mi,
+                           VL_LAMBDA_TYPE & m)
+      {
+        mi.maxloc(working_array[idx], idx);
+        m.maxloc(working_array[idx], idx);
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(maxinit.getVal()), big_max);
   ASSERT_EQ(static_cast<IDX_TYPE>(maxinit.getLoc()), big_maxloc);
@@ -81,25 +86,21 @@ void ForallReduceMaxLocBasicTestImpl(const SEG_TYPE& seg,
   ASSERT_EQ(static_cast<IDX_TYPE>(max.getLoc()), maxloc_init);
 
   DATA_TYPE factor = 2;
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE &m) {
-      m.maxloc( working_array[idx] * factor, idx);
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE & m)
+      { m.maxloc(working_array[idx] * factor, idx); });
   ASSERT_EQ(static_cast<DATA_TYPE>(max.getVal()), ref_max * factor);
   ASSERT_EQ(static_cast<IDX_TYPE>(max.getLoc()), ref_maxloc);
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMaxLocBasicTest);
 template <typename T>
 class ForallReduceMaxLocBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMaxLocBasicTest, ReduceMaxLocBasicForall)
 {
@@ -109,67 +110,67 @@ TYPED_TEST_P(ForallReduceMaxLocBasicTest, ReduceMaxLocBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r1, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r2, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r3, seg_idx,
+                                                              working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                              working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMaxLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+                                  RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                                  REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceMaxLocBasicTest,
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMaxLocAlt.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMaxLocAlt.hpp
index 68810ac893..e3391b0e42 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMaxLocAlt.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMaxLocAlt.hpp
@@ -13,63 +13,68 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMaxLocAltBasicTestImpl(const SEG_TYPE& seg,
-                                     const std::vector<IDX_TYPE>& seg_idx,
-                                     camp::resources::Resource working_res)
+                                        const std::vector<IDX_TYPE>& seg_idx,
+                                        camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
-  const DATA_TYPE max_init = -modval;
+  const int modval           = 100;
+  const DATA_TYPE max_init   = -modval;
   const IDX_TYPE maxloc_init = -1;
-  const IDX_TYPE maxloc_idx = seg_idx[ idx_len * 2/3 ];
-  const DATA_TYPE big_max = modval*10;
-  const IDX_TYPE big_maxloc = maxloc_init;
+  const IDX_TYPE maxloc_idx  = seg_idx[idx_len * 2 / 3];
+  const DATA_TYPE big_max    = modval * 10;
+  const IDX_TYPE big_maxloc  = maxloc_init;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( 1000 % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(1000 % modval);
   }
   test_array[maxloc_idx] = static_cast<DATA_TYPE>(big_max);
 
-  DATA_TYPE ref_max = max_init;
+  DATA_TYPE ref_max   = max_init;
   IDX_TYPE ref_maxloc = maxloc_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    if ( test_array[ seg_idx[i] ] > ref_max ) {
-       ref_max = test_array[ seg_idx[i] ];
-       ref_maxloc = seg_idx[i];
-    } 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    if (test_array[seg_idx[i]] > ref_max)
+    {
+      ref_max    = test_array[seg_idx[i]];
+      ref_maxloc = seg_idx[i];
+    }
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
 
   using VL_TYPE = RAJA::expt::ValLoc<DATA_TYPE, IDX_TYPE>;
-  using VL_LAMBDA_TYPE = RAJA::expt::ValLocOp<DATA_TYPE, IDX_TYPE, RAJA::operators::maximum>;
+  using VL_LAMBDA_TYPE =
+      RAJA::expt::ValLocOp<DATA_TYPE, IDX_TYPE, RAJA::operators::maximum>;
   VL_TYPE maxinit(big_max, maxloc_init);
   VL_TYPE max(max_init, maxloc_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&maxinit),
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
-    RAJA::expt::KernelName("RAJA Reduce MaxLoc"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE &mi, VL_LAMBDA_TYPE &m) {
-      mi.maxloc( working_array[idx], idx );
-      m.maxloc( working_array[idx], idx );
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::maximum>(&maxinit),
+      RAJA::expt::Reduce<RAJA::operators::maximum>(&max),
+      RAJA::expt::KernelName("RAJA Reduce MaxLoc"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE & mi,
+                           VL_LAMBDA_TYPE & m)
+      {
+        mi.maxloc(working_array[idx], idx);
+        m.maxloc(working_array[idx], idx);
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(maxinit.getVal()), big_max);
   ASSERT_EQ(static_cast<IDX_TYPE>(maxinit.getLoc()), big_maxloc);
@@ -78,49 +83,47 @@ void ForallReduceMaxLocAltBasicTestImpl(const SEG_TYPE& seg,
 
   VL_TYPE max2(max_init, maxloc_init);
 
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::Reduce<RAJA::operators::maximum>(&max2),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE RAJA_UNUSED_ARG(idx), VL_LAMBDA_TYPE &m2) {
-      m2.max( max );
-  });
-  ASSERT_EQ(static_cast<DATA_TYPE>(max2.getVal()), static_cast<DATA_TYPE>(max.getVal()));
-  ASSERT_EQ(static_cast<IDX_TYPE>(max2.getLoc()), static_cast<IDX_TYPE>(max.getLoc()));
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::maximum>(&max2),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE RAJA_UNUSED_ARG(idx), VL_LAMBDA_TYPE & m2)
+      { m2.max(max); });
+  ASSERT_EQ(static_cast<DATA_TYPE>(max2.getVal()),
+            static_cast<DATA_TYPE>(max.getVal()));
+  ASSERT_EQ(static_cast<IDX_TYPE>(max2.getLoc()),
+            static_cast<IDX_TYPE>(max.getLoc()));
 
-  DATA_TYPE s_max = max_init;
+  DATA_TYPE s_max   = max_init;
   IDX_TYPE s_maxloc = maxloc_init;
 
   const int factor = 4;
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&s_max, &s_maxloc),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE &m) {
-      m.maxloc( working_array[idx] * factor, idx);
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&s_max, &s_maxloc),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE & m)
+      { m.maxloc(working_array[idx] * factor, idx); });
   ASSERT_EQ(static_cast<DATA_TYPE>(s_max), ref_max * factor);
   ASSERT_EQ(static_cast<IDX_TYPE>(s_maxloc), ref_maxloc);
 
-  DATA_TYPE s_max2 = max_init;
+  DATA_TYPE s_max2   = max_init;
   IDX_TYPE s_maxloc2 = maxloc_init;
 
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&s_max2, &s_maxloc2),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE RAJA_UNUSED_ARG(idx), VL_LAMBDA_TYPE &m2) {
-      m2.max(max2);
-  });
-  ASSERT_EQ(static_cast<DATA_TYPE>(s_max2), static_cast<DATA_TYPE>(max2.getVal()));
-  ASSERT_EQ(static_cast<IDX_TYPE>(s_maxloc2), static_cast<IDX_TYPE>(max2.getLoc()));
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&s_max2, &s_maxloc2),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE RAJA_UNUSED_ARG(idx), VL_LAMBDA_TYPE & m2)
+      { m2.max(max2); });
+  ASSERT_EQ(static_cast<DATA_TYPE>(s_max2),
+            static_cast<DATA_TYPE>(max2.getVal()));
+  ASSERT_EQ(static_cast<IDX_TYPE>(s_maxloc2),
+            static_cast<IDX_TYPE>(max2.getLoc()));
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMaxLocAltBasicTest);
 template <typename T>
 class ForallReduceMaxLocAltBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMaxLocAltBasicTest, ReduceMaxLocAltBasicForall)
 {
@@ -130,67 +133,68 @@ TYPED_TEST_P(ForallReduceMaxLocAltBasicTest, ReduceMaxLocAltBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMaxLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+                                     RAJA::TypedRangeSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r1, seg_idx,
+                                                                 working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMaxLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+                                     RAJA::TypedRangeSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r2, seg_idx,
+                                                                 working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMaxLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+                                     RAJA::TypedRangeSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r3, seg_idx,
+                                                                 working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMaxLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+                                     RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                                 working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMaxLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+                                     RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                                 working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMaxLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+                                     RAJA::TypedListSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(l1, seg_idx,
+                                                                 working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceMaxLocAltBasicTest,
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMin.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMin.hpp
index 9aab696301..6e102db0d0 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMin.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMin.hpp
@@ -13,9 +13,11 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
                                   const std::vector<IDX_TYPE>& seg_idx,
                                   camp::resources::Resource working_res)
@@ -23,29 +25,28 @@ void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
   using REF_MIN = RAJA::expt::ValOp<DATA_TYPE, RAJA::operators::minimum>;
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
-  const DATA_TYPE min_init = modval+1;
+  const int modval          = 100;
+  const DATA_TYPE min_init  = modval + 1;
   const DATA_TYPE small_min = -modval;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_min = min_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_min = RAJA_MIN(test_array[ seg_idx[i] ], ref_min); 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_min = RAJA_MIN(test_array[seg_idx[i]], ref_min);
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -53,14 +54,15 @@ void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
   DATA_TYPE mininit(small_min);
   DATA_TYPE min(min_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&mininit),
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-    RAJA::expt::KernelName("RAJA Reduce Min"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MIN &mi, REF_MIN &m) {
-      mi.min(working_array[idx]);
-      m.min(working_array[idx]);
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::minimum>(&mininit),
+      RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+      RAJA::expt::KernelName("RAJA Reduce Min"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MIN & mi, REF_MIN & m)
+      {
+        mi.min(working_array[idx]);
+        m.min(working_array[idx]);
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(mininit), small_min);
   ASSERT_EQ(static_cast<DATA_TYPE>(min), ref_min);
@@ -68,25 +70,21 @@ void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
   min = min_init;
   ASSERT_EQ(static_cast<DATA_TYPE>(min), min_init);
 
-  DATA_TYPE factor = 3; 
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MIN &m) {
-      m.min(working_array[idx] * factor);
-  });
+  DATA_TYPE factor = 3;
+  RAJA::forall<EXEC_POLICY>(seg,
+                            RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MIN & m)
+                            { m.min(working_array[idx] * factor); });
   ASSERT_EQ(static_cast<DATA_TYPE>(min), ref_min * factor);
 
   factor = 2;
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MIN &m) {
-      m.min(working_array[idx] * factor);
-  });
+  RAJA::forall<EXEC_POLICY>(seg,
+                            RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_MIN & m)
+                            { m.min(working_array[idx] * factor); });
   ASSERT_EQ(static_cast<DATA_TYPE>(min), ref_min * factor);
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -94,8 +92,7 @@ void ForallReduceMinBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(ForallReduceMinBasicTest);
 template <typename T>
 class ForallReduceMinBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMinBasicTest, ReduceMinBasicForall)
 {
@@ -105,70 +102,66 @@ TYPED_TEST_P(ForallReduceMinBasicTest, ReduceMinBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r1, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r2, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r3, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r4, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                           working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r5, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                           working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 l1, seg_idx, working_res);
+                               RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallReduceMinBasicTest,
-                            ReduceMinBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallReduceMinBasicTest, ReduceMinBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCEMIN_HPP__
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMinLoc.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMinLoc.hpp
index dc48b403ea..508f64bc4a 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMinLoc.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMinLoc.hpp
@@ -13,62 +13,67 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMinLocBasicTestImpl(const SEG_TYPE& seg,
                                      const std::vector<IDX_TYPE>& seg_idx,
                                      camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
-
-  const int modval = 100;
-  const DATA_TYPE min_init = modval+1;
-  const IDX_TYPE minloc_init = -1;
-  const IDX_TYPE minloc_idx = seg_idx[ idx_len * 2/3 ];
-  const DATA_TYPE small_min = -modval;
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
+
+  const int modval            = 100;
+  const DATA_TYPE min_init    = modval + 1;
+  const IDX_TYPE minloc_init  = -1;
+  const IDX_TYPE minloc_idx   = seg_idx[idx_len * 2 / 3];
+  const DATA_TYPE small_min   = -modval;
   const IDX_TYPE small_minloc = minloc_init;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   test_array[minloc_idx] = static_cast<DATA_TYPE>(small_min);
 
-  DATA_TYPE ref_min = min_init;
+  DATA_TYPE ref_min   = min_init;
   IDX_TYPE ref_minloc = minloc_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    if ( test_array[ seg_idx[i] ] < ref_min ) {
-       ref_min = test_array[ seg_idx[i] ];
-       ref_minloc = seg_idx[i];
-    } 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    if (test_array[seg_idx[i]] < ref_min)
+    {
+      ref_min    = test_array[seg_idx[i]];
+      ref_minloc = seg_idx[i];
+    }
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   using VL_TYPE = RAJA::expt::ValLoc<DATA_TYPE, IDX_TYPE>;
-  using VL_LAMBDA_TYPE = RAJA::expt::ValLocOp<DATA_TYPE, IDX_TYPE, RAJA::operators::minimum>;
+  using VL_LAMBDA_TYPE =
+      RAJA::expt::ValLocOp<DATA_TYPE, IDX_TYPE, RAJA::operators::minimum>;
   VL_TYPE mininit(small_min, minloc_init);
   VL_TYPE min(min_init, minloc_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&mininit),
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-    RAJA::expt::KernelName("RAJA Reduce MinLoc"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE &mi, VL_LAMBDA_TYPE &m) {
-      mi.minloc( working_array[idx], idx );
-      m.minloc( working_array[idx], idx );
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::minimum>(&mininit),
+      RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+      RAJA::expt::KernelName("RAJA Reduce MinLoc"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE & mi,
+                           VL_LAMBDA_TYPE & m)
+      {
+        mi.minloc(working_array[idx], idx);
+        m.minloc(working_array[idx], idx);
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(mininit.getVal()), small_min);
   ASSERT_EQ(static_cast<IDX_TYPE>(mininit.getLoc()), small_minloc);
@@ -80,25 +85,21 @@ void ForallReduceMinLocBasicTestImpl(const SEG_TYPE& seg,
   ASSERT_EQ(static_cast<IDX_TYPE>(min.getLoc()), minloc_init);
 
   DATA_TYPE factor = 2;
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE &m) {
-      m.minloc( working_array[idx] * factor, idx);
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE & m)
+      { m.minloc(working_array[idx] * factor, idx); });
   ASSERT_EQ(static_cast<DATA_TYPE>(min.getVal()), ref_min * factor);
   ASSERT_EQ(static_cast<IDX_TYPE>(min.getLoc()), ref_minloc);
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMinLocBasicTest);
 template <typename T>
 class ForallReduceMinLocBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMinLocBasicTest, ReduceMinLocBasicForall)
 {
@@ -108,67 +109,67 @@ TYPED_TEST_P(ForallReduceMinLocBasicTest, ReduceMinLocBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r1, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r2, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r3, seg_idx,
+                                                              working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                              working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                   RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+                                  EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                              working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMinLocBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+                                  RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                                  REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceMinLocBasicTest,
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMinLocAlt.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMinLocAlt.hpp
index 07a6058234..784f7ebdaa 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMinLocAlt.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceMinLocAlt.hpp
@@ -13,62 +13,67 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallReduceMinLocAltBasicTestImpl(const SEG_TYPE& seg,
-                                     const std::vector<IDX_TYPE>& seg_idx,
-                                     camp::resources::Resource working_res)
+                                        const std::vector<IDX_TYPE>& seg_idx,
+                                        camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
-
-  const int modval = 100;
-  const DATA_TYPE min_init = modval+1;
-  const IDX_TYPE minloc_init = -1;
-  const IDX_TYPE minloc_idx = seg_idx[ idx_len * 2/3 ];
-  const DATA_TYPE small_min = -modval;
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
+
+  const int modval            = 100;
+  const DATA_TYPE min_init    = modval + 1;
+  const IDX_TYPE minloc_init  = -1;
+  const IDX_TYPE minloc_idx   = seg_idx[idx_len * 2 / 3];
+  const DATA_TYPE small_min   = -modval;
   const IDX_TYPE small_minloc = minloc_init;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   test_array[minloc_idx] = static_cast<DATA_TYPE>(small_min);
 
-  DATA_TYPE ref_min = min_init;
+  DATA_TYPE ref_min   = min_init;
   IDX_TYPE ref_minloc = minloc_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    if ( test_array[ seg_idx[i] ] < ref_min ) {
-       ref_min = test_array[ seg_idx[i] ];
-       ref_minloc = seg_idx[i];
-    } 
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    if (test_array[seg_idx[i]] < ref_min)
+    {
+      ref_min    = test_array[seg_idx[i]];
+      ref_minloc = seg_idx[i];
+    }
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   using VL_TYPE = RAJA::expt::ValLoc<DATA_TYPE, IDX_TYPE>;
-  using VL_LAMBDA_TYPE = RAJA::expt::ValLocOp<DATA_TYPE, IDX_TYPE, RAJA::operators::minimum>;
+  using VL_LAMBDA_TYPE =
+      RAJA::expt::ValLocOp<DATA_TYPE, IDX_TYPE, RAJA::operators::minimum>;
   VL_TYPE mininit(small_min, minloc_init);
   VL_TYPE min(min_init, minloc_init);
 
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&mininit),
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-    RAJA::expt::KernelName("RAJA Reduce MinLoc"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE &mi, VL_LAMBDA_TYPE &m) {
-      mi.minloc( working_array[idx], idx );
-      m.minloc( working_array[idx], idx );
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::minimum>(&mininit),
+      RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+      RAJA::expt::KernelName("RAJA Reduce MinLoc"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE & mi,
+                           VL_LAMBDA_TYPE & m)
+      {
+        mi.minloc(working_array[idx], idx);
+        m.minloc(working_array[idx], idx);
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(mininit.getVal()), small_min);
   ASSERT_EQ(static_cast<IDX_TYPE>(mininit.getLoc()), small_minloc);
@@ -77,49 +82,47 @@ void ForallReduceMinLocAltBasicTestImpl(const SEG_TYPE& seg,
 
   VL_TYPE min2(min_init, minloc_init);
 
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::Reduce<RAJA::operators::minimum>(&min2),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE RAJA_UNUSED_ARG(idx), VL_LAMBDA_TYPE &m2) {
-      m2.min( min );
-  });
-  ASSERT_EQ(static_cast<DATA_TYPE>(min2.getVal()), static_cast<DATA_TYPE>(min.getVal()));
-  ASSERT_EQ(static_cast<IDX_TYPE>(min2.getLoc()), static_cast<IDX_TYPE>(min.getLoc()));
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::minimum>(&min2),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE RAJA_UNUSED_ARG(idx), VL_LAMBDA_TYPE & m2)
+      { m2.min(min); });
+  ASSERT_EQ(static_cast<DATA_TYPE>(min2.getVal()),
+            static_cast<DATA_TYPE>(min.getVal()));
+  ASSERT_EQ(static_cast<IDX_TYPE>(min2.getLoc()),
+            static_cast<IDX_TYPE>(min.getLoc()));
 
-  DATA_TYPE s_min = min_init;
+  DATA_TYPE s_min   = min_init;
   IDX_TYPE s_minloc = minloc_init;
 
   const int factor = 4;
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&s_min, &s_minloc),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE &m) {
-      m.minloc( working_array[idx] * factor, idx);
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&s_min, &s_minloc),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, VL_LAMBDA_TYPE & m)
+      { m.minloc(working_array[idx] * factor, idx); });
   ASSERT_EQ(static_cast<DATA_TYPE>(s_min), ref_min * factor);
   ASSERT_EQ(static_cast<IDX_TYPE>(s_minloc), ref_minloc);
 
-  DATA_TYPE s_min2 = min_init;
+  DATA_TYPE s_min2   = min_init;
   IDX_TYPE s_minloc2 = minloc_init;
 
-  RAJA::forall<EXEC_POLICY>(seg,
-    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&s_min2, &s_minloc2),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE RAJA_UNUSED_ARG(idx), VL_LAMBDA_TYPE &m2) {
-      m2.min(min2);
-  });
-  ASSERT_EQ(static_cast<DATA_TYPE>(s_min2), static_cast<DATA_TYPE>(min2.getVal()));
-  ASSERT_EQ(static_cast<IDX_TYPE>(s_minloc2), static_cast<IDX_TYPE>(min2.getLoc()));
-   
-
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&s_min2, &s_minloc2),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE RAJA_UNUSED_ARG(idx), VL_LAMBDA_TYPE & m2)
+      { m2.min(min2); });
+  ASSERT_EQ(static_cast<DATA_TYPE>(s_min2),
+            static_cast<DATA_TYPE>(min2.getVal()));
+  ASSERT_EQ(static_cast<IDX_TYPE>(s_minloc2),
+            static_cast<IDX_TYPE>(min2.getLoc()));
+
+
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMinLocAltBasicTest);
 template <typename T>
 class ForallReduceMinLocAltBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMinLocAltBasicTest, ReduceMinLocAltBasicForall)
 {
@@ -129,67 +132,68 @@ TYPED_TEST_P(ForallReduceMinLocAltBasicTest, ReduceMinLocAltBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceMinLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+                                     RAJA::TypedRangeSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r1, seg_idx,
+                                                                 working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceMinLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+                                     RAJA::TypedRangeSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r2, seg_idx,
+                                                                 working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceMinLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+                                     RAJA::TypedRangeSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r3, seg_idx,
+                                                                 working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceMinLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+                                     RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                                 working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceMinLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+                                     RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                                 working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceMinLocAltBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  EXEC_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+                                     RAJA::TypedListSegment<IDX_TYPE>,
+                                     EXEC_POLICY, REDUCE_POLICY>(l1, seg_idx,
+                                                                 working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceMinLocAltBasicTest,
diff --git a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceSum.hpp b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceSum.hpp
index 4105700f7c..d4abbf0192 100644
--- a/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceSum.hpp
+++ b/test/functional/forall/reduce-basic/tests/test-forall-basic-expt-ReduceSum.hpp
@@ -13,37 +13,38 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
-void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg, 
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg,
                                   const std::vector<IDX_TYPE>& seg_idx,
                                   camp::resources::Resource working_res)
 {
   using REF_SUM = RAJA::expt::ValOp<DATA_TYPE, RAJA::operators::plus>;
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_sum = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_sum += test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_sum += test_array[seg_idx[i]];
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -51,14 +52,15 @@ void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg,
   DATA_TYPE sum(0);
   DATA_TYPE sum2(2);
 
-  RAJA::forall<EXEC_POLICY>(seg, 
-    RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
-    RAJA::expt::Reduce<RAJA::operators::plus>(&sum2),
-    RAJA::expt::KernelName("RAJA Reduce Sum"),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_SUM &s1, REF_SUM &s2) {
-      s1 += working_array[idx];
-      s2 += working_array[idx];
-  });
+  RAJA::forall<EXEC_POLICY>(
+      seg, RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
+      RAJA::expt::Reduce<RAJA::operators::plus>(&sum2),
+      RAJA::expt::KernelName("RAJA Reduce Sum"),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_SUM & s1, REF_SUM & s2)
+      {
+        s1 += working_array[idx];
+        s2 += working_array[idx];
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(sum), ref_sum);
   ASSERT_EQ(static_cast<DATA_TYPE>(sum2), ref_sum + 2);
@@ -67,20 +69,18 @@ void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg,
 
   const int nloops = 2;
 
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::forall<EXEC_POLICY>(seg, 
-      RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
-      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_SUM &s) {
-        s += working_array[idx];
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::forall<EXEC_POLICY>(seg,
+                              RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
+                              [=] RAJA_HOST_DEVICE(IDX_TYPE idx, REF_SUM & s)
+                              { s += working_array[idx]; });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(sum), nloops * ref_sum);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -88,8 +88,7 @@ void ForallReduceSumBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(ForallReduceSumBasicTest);
 template <typename T>
 class ForallReduceSumBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceSumBasicTest, ReduceSumBasicForall)
 {
@@ -99,70 +98,66 @@ TYPED_TEST_P(ForallReduceSumBasicTest, ReduceSumBasicForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r1, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r2, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r3, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r4, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                           working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 r5, seg_idx, working_res);
+                               EXEC_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                           working_res);
 
-// List segment tests
-  seg_idx.clear(); 
+  // List segment tests
+  seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(), 
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   ForallReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               EXEC_POLICY, REDUCE_POLICY>(
-                                 l1, seg_idx, working_res);
+                               RAJA::TypedListSegment<IDX_TYPE>, EXEC_POLICY,
+                               REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallReduceSumBasicTest,
-                            ReduceSumBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallReduceSumBasicTest, ReduceSumBasicForall);
 
 #endif  // __TEST_FORALL_BASIC_REDUCESUM_HPP__
diff --git a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMax.hpp b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMax.hpp
index 6335affc02..dc7be52f55 100644
--- a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMax.hpp
+++ b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMax.hpp
@@ -19,12 +19,14 @@
 // not aligned with warp boundaries, for example, to check that reduction
 // mechanics don't depend on any sort of special indexing.
 //
-template <typename IDX_TYPE, typename WORKING_RES,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename IDX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallIndexSetReduceMaxMultipleTestImpl()
 {
   using RangeSegType = RAJA::TypedRangeSegment<IDX_TYPE>;
-  using IdxSetType = RAJA::TypedIndexSet<RangeSegType>;
+  using IdxSetType   = RAJA::TypedIndexSet<RangeSegType>;
 
   RAJA::TypedRangeSegment<IDX_TYPE> r1(1, 1037);
   RAJA::TypedRangeSegment<IDX_TYPE> r2(1043, 2036);
@@ -39,21 +41,19 @@ void ForallIndexSetReduceMaxMultipleTestImpl()
 
   const IDX_TYPE alen = 15286;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   double* working_array;
   double* check_array;
   double* test_array;
 
-  allocateForallTestData<double>(alen,
-                                 working_res,
-                                 &working_array,
-                                 &check_array,
-                                 &test_array);
+  allocateForallTestData<double>(alen, working_res, &working_array,
+                                 &check_array, &test_array);
 
   const double default_val = -DBL_MAX;
 
-  for (IDX_TYPE i = 0; i < alen; ++i) {
+  for (IDX_TYPE i = 0; i < alen; ++i)
+  {
     test_array[i] = default_val;
   }
 
@@ -62,49 +62,49 @@ void ForallIndexSetReduceMaxMultipleTestImpl()
   std::mt19937 mt(rd());
   std::uniform_real_distribution<double> dist(-10, 10);
 
-  double current_max = default_val;
+  double current_max    = default_val;
   const int test_repeat = 4;
 
   RAJA::ReduceMax<REDUCE_POLICY, double> dmax0(default_val);
   RAJA::ReduceMax<REDUCE_POLICY, double> dmax1(default_val);
 
-  for (int tcount = 1; tcount <= test_repeat; ++tcount) {
-
-     // pick an index in one of the segments
-     int index = 5127;  // seg 3
-     if (tcount == 2) index = 1938; // seg2
-     if (tcount == 3) index = 13333; // seg4
-     if (tcount == 4) index = 52; // seg1
-
-     double droll = dist(mt);
-     if (test_array[index] > droll) {
-       test_array[index] = droll;
-       current_max = RAJA_MAX(current_max, droll);
-     }
-
-     working_res.memcpy(working_array, test_array, sizeof(double) * alen);
-
-     RAJA::forall<EXEC_POLICY>(iset, [=] RAJA_HOST_DEVICE(IDX_TYPE i) {
-       dmax0.max(working_array[i]);
-       dmax1.max(2 * working_array[i]);
-     });
-
-     ASSERT_FLOAT_EQ(static_cast<double>(dmax0.get()), current_max);
-     ASSERT_FLOAT_EQ(static_cast<double>(dmax1.get()), 2 * current_max);
-
+  for (int tcount = 1; tcount <= test_repeat; ++tcount)
+  {
+
+    // pick an index in one of the segments
+    int index = 5127;                // seg 3
+    if (tcount == 2) index = 1938;   // seg2
+    if (tcount == 3) index = 13333;  // seg4
+    if (tcount == 4) index = 52;     // seg1
+
+    double droll = dist(mt);
+    if (test_array[index] > droll)
+    {
+      test_array[index] = droll;
+      current_max       = RAJA_MAX(current_max, droll);
+    }
+
+    working_res.memcpy(working_array, test_array, sizeof(double) * alen);
+
+    RAJA::forall<EXEC_POLICY>(iset,
+                              [=] RAJA_HOST_DEVICE(IDX_TYPE i)
+                              {
+                                dmax0.max(working_array[i]);
+                                dmax1.max(2 * working_array[i]);
+                              });
+
+    ASSERT_FLOAT_EQ(static_cast<double>(dmax0.get()), current_max);
+    ASSERT_FLOAT_EQ(static_cast<double>(dmax1.get()), 2 * current_max);
   }
 
-  deallocateForallTestData<double>(working_res,
-                                   working_array,
-                                   check_array,
+  deallocateForallTestData<double>(working_res, working_array, check_array,
                                    test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallIndexSetReduceMaxMultipleTest);
 template <typename T>
 class ForallIndexSetReduceMaxMultipleTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallIndexSetReduceMaxMultipleTest,
              ReduceMaxMultipleForallIndexSet)
@@ -114,8 +114,8 @@ TYPED_TEST_P(ForallIndexSetReduceMaxMultipleTest,
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<2>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  ForallIndexSetReduceMaxMultipleTestImpl<IDX_TYPE, WORKING_RES,
-                                          EXEC_POLICY, REDUCE_POLICY>();
+  ForallIndexSetReduceMaxMultipleTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY,
+                                          REDUCE_POLICY>();
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetReduceMaxMultipleTest,
diff --git a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMaxLoc.hpp b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMaxLoc.hpp
index 4d30728fe6..45bb37cbe3 100644
--- a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMaxLoc.hpp
+++ b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMaxLoc.hpp
@@ -19,12 +19,14 @@
 // not aligned with warp boundaries, for example, to check that reduction
 // mechanics don't depend on any sort of special indexing.
 //
-template <typename IDX_TYPE, typename WORKING_RES, 
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename IDX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallIndexSetReduceMaxLocMultipleTestImpl()
 {
   using RangeSegType = RAJA::TypedRangeSegment<IDX_TYPE>;
-  using IdxSetType = RAJA::TypedIndexSet<RangeSegType>;
+  using IdxSetType   = RAJA::TypedIndexSet<RangeSegType>;
 
   RAJA::TypedRangeSegment<IDX_TYPE> r1(1, 1037);
   RAJA::TypedRangeSegment<IDX_TYPE> r2(1043, 2036);
@@ -32,77 +34,76 @@ void ForallIndexSetReduceMaxLocMultipleTestImpl()
   RAJA::TypedRangeSegment<IDX_TYPE> r4(10243, 15286);
 
   IdxSetType iset;
-  iset.push_back(r1); 
-  iset.push_back(r2); 
-  iset.push_back(r3); 
-  iset.push_back(r4); 
+  iset.push_back(r1);
+  iset.push_back(r2);
+  iset.push_back(r3);
+  iset.push_back(r4);
 
   const IDX_TYPE alen = 15286;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   double* working_array;
   double* check_array;
   double* test_array;
 
-  allocateForallTestData<double>(alen,
-                                 working_res,
-                                 &working_array,
-                                 &check_array,
-                                 &test_array);
+  allocateForallTestData<double>(alen, working_res, &working_array,
+                                 &check_array, &test_array);
 
-  double current_max = -DBL_MAX;
+  double current_max   = -DBL_MAX;
   IDX_TYPE current_loc = -1;
 
-  for (IDX_TYPE i = 0; i < alen; ++i) {
+  for (IDX_TYPE i = 0; i < alen; ++i)
+  {
     test_array[i] = current_max;
   }
-  
+
   const int test_repeat = 4;
 
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, double, IDX_TYPE> dmax0(current_max, current_loc);
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, double, IDX_TYPE> dmax1(current_max, current_loc);
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, double, IDX_TYPE> dmax0(current_max,
+                                                            current_loc);
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, double, IDX_TYPE> dmax1(current_max,
+                                                            current_loc);
 
-  for (int tcount = 1; tcount <= test_repeat; ++tcount) {
+  for (int tcount = 1; tcount <= test_repeat; ++tcount)
+  {
 
-     // set max val 
-     current_max = 100.0 + tcount * 10.0;
+    // set max val
+    current_max = 100.0 + tcount * 10.0;
 
-     // pick an index in one of the segments
-     current_loc = 5127;  // seg 3
-     if (tcount == 2) current_loc = 1938; // seg2
-     if (tcount == 3) current_loc = 13333; // seg4
-     if (tcount == 4) current_loc = 52; // seg1
+    // pick an index in one of the segments
+    current_loc = 5127;                    // seg3
+    if (tcount == 2) current_loc = 1938;   // seg2
+    if (tcount == 3) current_loc = 13333;  // seg4
+    if (tcount == 4) current_loc = 52;     // seg1
 
-     test_array[current_loc] = current_max;
- 
-     working_res.memcpy(working_array, test_array, sizeof(double) * alen);
+    test_array[current_loc] = current_max;
 
-     RAJA::forall<EXEC_POLICY>(iset, [=] RAJA_HOST_DEVICE(IDX_TYPE i) {
-       dmax0.maxloc(working_array[i], i);
-       dmax1.maxloc(2 * working_array[i], i);
-     });
+    working_res.memcpy(working_array, test_array, sizeof(double) * alen);
 
-     ASSERT_FLOAT_EQ(static_cast<double>(dmax0.get()), current_max);
-     ASSERT_EQ(static_cast<IDX_TYPE>(dmax0.getLoc()), current_loc);
-     ASSERT_FLOAT_EQ(static_cast<double>(dmax1.get()), 2 * current_max);
-     ASSERT_EQ(static_cast<IDX_TYPE>(dmax1.getLoc()), current_loc);
+    RAJA::forall<EXEC_POLICY>(iset,
+                              [=] RAJA_HOST_DEVICE(IDX_TYPE i)
+                              {
+                                dmax0.maxloc(working_array[i], i);
+                                dmax1.maxloc(2 * working_array[i], i);
+                              });
 
+    ASSERT_FLOAT_EQ(static_cast<double>(dmax0.get()), current_max);
+    ASSERT_EQ(static_cast<IDX_TYPE>(dmax0.getLoc()), current_loc);
+    ASSERT_FLOAT_EQ(static_cast<double>(dmax1.get()), 2 * current_max);
+    ASSERT_EQ(static_cast<IDX_TYPE>(dmax1.getLoc()), current_loc);
   }
 
-  deallocateForallTestData<double>(working_res,
-                                   working_array,
-                                   check_array,
+  deallocateForallTestData<double>(working_res, working_array, check_array,
                                    test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallIndexSetReduceMaxLocMultipleTest);
 template <typename T>
 class ForallIndexSetReduceMaxLocMultipleTest : public ::testing::Test
-{
-};
+{};
 
-TYPED_TEST_P(ForallIndexSetReduceMaxLocMultipleTest, 
+TYPED_TEST_P(ForallIndexSetReduceMaxLocMultipleTest,
              ReduceMaxLocMultipleForallIndexSet)
 {
   using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
@@ -110,8 +111,8 @@ TYPED_TEST_P(ForallIndexSetReduceMaxLocMultipleTest,
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<2>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  ForallIndexSetReduceMaxLocMultipleTestImpl<IDX_TYPE, WORKING_RES,
-                                             EXEC_POLICY, REDUCE_POLICY>();
+  ForallIndexSetReduceMaxLocMultipleTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY,
+                                             REDUCE_POLICY>();
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetReduceMaxLocMultipleTest,
diff --git a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMin.hpp b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMin.hpp
index cf3b60d078..9a0cc3b67f 100644
--- a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMin.hpp
+++ b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMin.hpp
@@ -19,12 +19,14 @@
 // not aligned with warp boundaries, for example, to check that reduction
 // mechanics don't depend on any sort of special indexing.
 //
-template <typename IDX_TYPE, typename WORKING_RES, 
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename IDX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallIndexSetReduceMinMultipleTestImpl()
 {
   using RangeSegType = RAJA::TypedRangeSegment<IDX_TYPE>;
-  using IdxSetType = RAJA::TypedIndexSet<RangeSegType>;
+  using IdxSetType   = RAJA::TypedIndexSet<RangeSegType>;
 
   RAJA::TypedRangeSegment<IDX_TYPE> r1(1, 1037);
   RAJA::TypedRangeSegment<IDX_TYPE> r2(1043, 2036);
@@ -32,81 +34,79 @@ void ForallIndexSetReduceMinMultipleTestImpl()
   RAJA::TypedRangeSegment<IDX_TYPE> r4(10243, 15286);
 
   IdxSetType iset;
-  iset.push_back(r1); 
-  iset.push_back(r2); 
-  iset.push_back(r3); 
-  iset.push_back(r4); 
+  iset.push_back(r1);
+  iset.push_back(r2);
+  iset.push_back(r3);
+  iset.push_back(r4);
 
   const IDX_TYPE alen = 15286;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   double* working_array;
   double* check_array;
   double* test_array;
 
-  allocateForallTestData<double>(alen,
-                                 working_res,
-                                 &working_array,
-                                 &check_array,
-                                 &test_array);
+  allocateForallTestData<double>(alen, working_res, &working_array,
+                                 &check_array, &test_array);
 
   const double default_val = DBL_MAX;
 
-  for (IDX_TYPE i = 0; i < alen; ++i) {
+  for (IDX_TYPE i = 0; i < alen; ++i)
+  {
     test_array[i] = default_val;
   }
-  
+
   // for setting random values in arrays
   std::random_device rd;
   std::mt19937 mt(rd());
   std::uniform_real_distribution<double> dist(-10, 10);
 
-  double current_min = default_val;
+  double current_min    = default_val;
   const int test_repeat = 4;
 
   RAJA::ReduceMin<REDUCE_POLICY, double> dmin0(default_val);
   RAJA::ReduceMin<REDUCE_POLICY, double> dmin1(default_val);
 
-  for (int tcount = 1; tcount <= test_repeat; ++tcount) {
-
-     // pick an index in one of the segments
-     int index = 5127;  // seg 3
-     if (tcount == 2) index = 1938; // seg2
-     if (tcount == 3) index = 13333; // seg4
-     if (tcount == 4) index = 52; // seg1
-
-     double droll = dist(mt);
-     if (test_array[index] > droll) {
-       test_array[index] = droll;
-       current_min = RAJA_MIN(current_min, droll);
-     }
- 
-     working_res.memcpy(working_array, test_array, sizeof(double) * alen);
-
-     RAJA::forall<EXEC_POLICY>(iset, [=] RAJA_HOST_DEVICE(IDX_TYPE i) {
-       dmin0.min(working_array[i]);
-       dmin1.min(2 * working_array[i]);
-     });
-
-     ASSERT_FLOAT_EQ(static_cast<double>(dmin0.get()), current_min);
-     ASSERT_FLOAT_EQ(static_cast<double>(dmin1.get()), 2 * current_min);
-
+  for (int tcount = 1; tcount <= test_repeat; ++tcount)
+  {
+
+    // pick an index in one of the segments
+    int index = 5127;                // seg3
+    if (tcount == 2) index = 1938;   // seg2
+    if (tcount == 3) index = 13333;  // seg4
+    if (tcount == 4) index = 52;     // seg1
+
+    double droll = dist(mt);
+    if (test_array[index] > droll)
+    {
+      test_array[index] = droll;
+      current_min       = RAJA_MIN(current_min, droll);
+    }
+
+    working_res.memcpy(working_array, test_array, sizeof(double) * alen);
+
+    RAJA::forall<EXEC_POLICY>(iset,
+                              [=] RAJA_HOST_DEVICE(IDX_TYPE i)
+                              {
+                                dmin0.min(working_array[i]);
+                                dmin1.min(2 * working_array[i]);
+                              });
+
+    ASSERT_FLOAT_EQ(static_cast<double>(dmin0.get()), current_min);
+    ASSERT_FLOAT_EQ(static_cast<double>(dmin1.get()), 2 * current_min);
   }
 
-  deallocateForallTestData<double>(working_res,
-                                   working_array,
-                                   check_array,
+  deallocateForallTestData<double>(working_res, working_array, check_array,
                                    test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallIndexSetReduceMinMultipleTest);
 template <typename T>
 class ForallIndexSetReduceMinMultipleTest : public ::testing::Test
-{
-};
+{};
 
-TYPED_TEST_P(ForallIndexSetReduceMinMultipleTest, 
+TYPED_TEST_P(ForallIndexSetReduceMinMultipleTest,
              ReduceMinMultipleForallIndexSet)
 {
   using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
@@ -114,8 +114,8 @@ TYPED_TEST_P(ForallIndexSetReduceMinMultipleTest,
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<2>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  ForallIndexSetReduceMinMultipleTestImpl<IDX_TYPE, WORKING_RES,
-                                          EXEC_POLICY, REDUCE_POLICY>();
+  ForallIndexSetReduceMinMultipleTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY,
+                                          REDUCE_POLICY>();
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetReduceMinMultipleTest,
diff --git a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMinLoc.hpp b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMinLoc.hpp
index b8abbd9f67..2b0fa8d43d 100644
--- a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMinLoc.hpp
+++ b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceMinLoc.hpp
@@ -19,12 +19,14 @@
 // not aligned with warp boundaries, for example, to check that reduction
 // mechanics don't depend on any sort of special indexing.
 //
-template <typename IDX_TYPE, typename WORKING_RES, 
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename IDX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallIndexSetReduceMinLocMultipleTestImpl()
 {
   using RangeSegType = RAJA::TypedRangeSegment<IDX_TYPE>;
-  using IdxSetType = RAJA::TypedIndexSet<RangeSegType>;
+  using IdxSetType   = RAJA::TypedIndexSet<RangeSegType>;
 
   RAJA::TypedRangeSegment<IDX_TYPE> r1(1, 1037);
   RAJA::TypedRangeSegment<IDX_TYPE> r2(1043, 2036);
@@ -32,77 +34,76 @@ void ForallIndexSetReduceMinLocMultipleTestImpl()
   RAJA::TypedRangeSegment<IDX_TYPE> r4(10243, 15286);
 
   IdxSetType iset;
-  iset.push_back(r1); 
-  iset.push_back(r2); 
-  iset.push_back(r3); 
-  iset.push_back(r4); 
+  iset.push_back(r1);
+  iset.push_back(r2);
+  iset.push_back(r3);
+  iset.push_back(r4);
 
   const IDX_TYPE alen = 15286;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   double* working_array;
   double* check_array;
   double* test_array;
 
-  allocateForallTestData<double>(alen,
-                                 working_res,
-                                 &working_array,
-                                 &check_array,
-                                 &test_array);
+  allocateForallTestData<double>(alen, working_res, &working_array,
+                                 &check_array, &test_array);
 
-  double current_min = DBL_MAX;
+  double current_min   = DBL_MAX;
   IDX_TYPE current_loc = -1;
 
-  for (IDX_TYPE i = 0; i < alen; ++i) {
+  for (IDX_TYPE i = 0; i < alen; ++i)
+  {
     test_array[i] = current_min;
   }
-  
+
   const int test_repeat = 4;
 
-  RAJA::ReduceMinLoc<REDUCE_POLICY, double, IDX_TYPE> dmin0(current_min, current_loc);
-  RAJA::ReduceMinLoc<REDUCE_POLICY, double, IDX_TYPE> dmin1(current_min, current_loc);
+  RAJA::ReduceMinLoc<REDUCE_POLICY, double, IDX_TYPE> dmin0(current_min,
+                                                            current_loc);
+  RAJA::ReduceMinLoc<REDUCE_POLICY, double, IDX_TYPE> dmin1(current_min,
+                                                            current_loc);
 
-  for (int tcount = 1; tcount <= test_repeat; ++tcount) {
+  for (int tcount = 1; tcount <= test_repeat; ++tcount)
+  {
 
-     // set min val 
-     current_min = 100.0 - tcount * 10.0;
+    // set min val
+    current_min = 100.0 - tcount * 10.0;
 
-     // pick an index in one of the segments
-     current_loc = 5127;  // seg 3
-     if (tcount == 2) current_loc = 1938; // seg2
-     if (tcount == 3) current_loc = 13333; // seg4
-     if (tcount == 4) current_loc = 52; // seg1
+    // pick an index in one of the segments
+    current_loc = 5127;                    // seg3
+    if (tcount == 2) current_loc = 1938;   // seg2
+    if (tcount == 3) current_loc = 13333;  // seg4
+    if (tcount == 4) current_loc = 52;     // seg1
 
-     test_array[current_loc] = current_min;
- 
-     working_res.memcpy(working_array, test_array, sizeof(double) * alen);
+    test_array[current_loc] = current_min;
 
-     RAJA::forall<EXEC_POLICY>(iset, [=] RAJA_HOST_DEVICE(IDX_TYPE i) {
-       dmin0.minloc(working_array[i], i);
-       dmin1.minloc(2 * working_array[i], i);
-     });
+    working_res.memcpy(working_array, test_array, sizeof(double) * alen);
 
-     ASSERT_FLOAT_EQ(static_cast<double>(dmin0.get()), current_min);
-     ASSERT_EQ(static_cast<IDX_TYPE>(dmin0.getLoc()), current_loc);
-     ASSERT_FLOAT_EQ(static_cast<double>(dmin1.get()), 2 * current_min);
-     ASSERT_EQ(static_cast<IDX_TYPE>(dmin1.getLoc()), current_loc);
+    RAJA::forall<EXEC_POLICY>(iset,
+                              [=] RAJA_HOST_DEVICE(IDX_TYPE i)
+                              {
+                                dmin0.minloc(working_array[i], i);
+                                dmin1.minloc(2 * working_array[i], i);
+                              });
 
+    ASSERT_FLOAT_EQ(static_cast<double>(dmin0.get()), current_min);
+    ASSERT_EQ(static_cast<IDX_TYPE>(dmin0.getLoc()), current_loc);
+    ASSERT_FLOAT_EQ(static_cast<double>(dmin1.get()), 2 * current_min);
+    ASSERT_EQ(static_cast<IDX_TYPE>(dmin1.getLoc()), current_loc);
   }
 
-  deallocateForallTestData<double>(working_res,
-                                   working_array,
-                                   check_array,
+  deallocateForallTestData<double>(working_res, working_array, check_array,
                                    test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallIndexSetReduceMinLocMultipleTest);
 template <typename T>
 class ForallIndexSetReduceMinLocMultipleTest : public ::testing::Test
-{
-};
+{};
 
-TYPED_TEST_P(ForallIndexSetReduceMinLocMultipleTest, 
+TYPED_TEST_P(ForallIndexSetReduceMinLocMultipleTest,
              ReduceMinLocMultipleForallIndexSet)
 {
   using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
@@ -110,8 +111,8 @@ TYPED_TEST_P(ForallIndexSetReduceMinLocMultipleTest,
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<2>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  ForallIndexSetReduceMinLocMultipleTestImpl<IDX_TYPE, WORKING_RES,
-                                             EXEC_POLICY, REDUCE_POLICY>();
+  ForallIndexSetReduceMinLocMultipleTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY,
+                                             REDUCE_POLICY>();
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetReduceMinLocMultipleTest,
diff --git a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceSum.hpp b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceSum.hpp
index 88d3f54d7e..7dd8f83844 100644
--- a/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceSum.hpp
+++ b/test/functional/forall/reduce-multiple-indexset/tests/test-forall-indexset-multiple-ReduceSum.hpp
@@ -17,12 +17,14 @@
 // not aligned with warp boundaries, for example, to check that reduction
 // mechanics don't depend on any sort of special indexing.
 //
-template <typename IDX_TYPE, typename WORKING_RES, 
-          typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename IDX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void ForallIndexSetReduceSumMultipleTestImpl()
 {
   using RangeSegType = RAJA::TypedRangeSegment<IDX_TYPE>;
-  using IdxSetType = RAJA::TypedIndexSet<RangeSegType>;
+  using IdxSetType   = RAJA::TypedIndexSet<RangeSegType>;
 
   RAJA::TypedRangeSegment<IDX_TYPE> r1(1, 1037);
   RAJA::TypedRangeSegment<IDX_TYPE> r2(1043, 2036);
@@ -30,48 +32,43 @@ void ForallIndexSetReduceSumMultipleTestImpl()
   RAJA::TypedRangeSegment<IDX_TYPE> r4(10243, 15286);
 
   IdxSetType iset;
-  iset.push_back(r1); 
-  iset.push_back(r2); 
-  iset.push_back(r3); 
-  iset.push_back(r4); 
+  iset.push_back(r1);
+  iset.push_back(r2);
+  iset.push_back(r3);
+  iset.push_back(r4);
 
   const IDX_TYPE alen = 15286;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   double* dworking_array;
   double* dcheck_array;
   double* dtest_array;
 
-  allocateForallTestData<double>(alen,
-                                 working_res,
-                                 &dworking_array,
-                                 &dcheck_array,
-                                 &dtest_array);
+  allocateForallTestData<double>(alen, working_res, &dworking_array,
+                                 &dcheck_array, &dtest_array);
 
   int* iworking_array;
   int* icheck_array;
   int* itest_array;
 
-  allocateForallTestData<int>(alen,
-                              working_res,
-                              &iworking_array,
-                              &icheck_array,
+  allocateForallTestData<int>(alen, working_res, &iworking_array, &icheck_array,
                               &itest_array);
 
   const double dinit_val = 0.1;
-  const int iinit_val = 1;
+  const int iinit_val    = 1;
 
-  for (IDX_TYPE i = 0; i < alen; ++i) {
+  for (IDX_TYPE i = 0; i < alen; ++i)
+  {
     dtest_array[i] = dinit_val;
     itest_array[i] = iinit_val;
   }
-  
+
   working_res.memcpy(dworking_array, dtest_array, sizeof(double) * alen);
   working_res.memcpy(iworking_array, itest_array, sizeof(int) * alen);
 
-  const double drinit = 5.0;
-  const int irinit = 4;
+  const double drinit   = 5.0;
+  const int irinit      = 4;
   const int test_repeat = 4;
 
   RAJA::ReduceSum<REDUCE_POLICY, double> dsum0(drinit * 1.0);
@@ -79,47 +76,44 @@ void ForallIndexSetReduceSumMultipleTestImpl()
   RAJA::ReduceSum<REDUCE_POLICY, double> dsum2(drinit * 3.0);
   RAJA::ReduceSum<REDUCE_POLICY, int> isum3(irinit * 4);
 
-  for (int tcount = 1; tcount <= test_repeat; ++tcount) {
- 
-    RAJA::forall<EXEC_POLICY>(iset, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-      dsum0 += 1.0 * dworking_array[idx];
-      isum1 += 2 * iworking_array[idx];
-      dsum2 += 3.0 * dworking_array[idx];
-      isum3 += 4 * iworking_array[idx];
-    });
+  for (int tcount = 1; tcount <= test_repeat; ++tcount)
+  {
+
+    RAJA::forall<EXEC_POLICY>(iset,
+                              [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                              {
+                                dsum0 += 1.0 * dworking_array[idx];
+                                isum1 += 2 * iworking_array[idx];
+                                dsum2 += 3.0 * dworking_array[idx];
+                                isum3 += 4 * iworking_array[idx];
+                              });
 
     double dchk_val = dinit_val * static_cast<double>(iset.getLength());
-    int ichk_val = iinit_val * static_cast<int>(iset.getLength());
+    int ichk_val    = iinit_val * static_cast<int>(iset.getLength());
 
-    ASSERT_FLOAT_EQ(static_cast<double>(dsum0.get()), 
-                               tcount * (1 * dchk_val) + (drinit * 1.0) );
-    ASSERT_EQ(static_cast<int>(isum1.get()), 
-                               tcount * (2 * ichk_val) + (irinit * 2) );
+    ASSERT_FLOAT_EQ(static_cast<double>(dsum0.get()),
+                    tcount * (1 * dchk_val) + (drinit * 1.0));
+    ASSERT_EQ(static_cast<int>(isum1.get()),
+              tcount * (2 * ichk_val) + (irinit * 2));
     ASSERT_FLOAT_EQ(static_cast<double>(dsum2.get()),
-                               tcount * (3 * dchk_val) + (drinit * 3.0) );
-    ASSERT_EQ(static_cast<int>(isum3.get()), 
-                               tcount * (4 * ichk_val) + (irinit * 4) );
-
+                    tcount * (3 * dchk_val) + (drinit * 3.0));
+    ASSERT_EQ(static_cast<int>(isum3.get()),
+              tcount * (4 * ichk_val) + (irinit * 4));
   }
 
-  deallocateForallTestData<double>(working_res,
-                                   dworking_array,
-                                   dcheck_array,
+  deallocateForallTestData<double>(working_res, dworking_array, dcheck_array,
                                    dtest_array);
 
-  deallocateForallTestData<int>(working_res,
-                                iworking_array,
-                                icheck_array,
+  deallocateForallTestData<int>(working_res, iworking_array, icheck_array,
                                 itest_array);
 }
 
 TYPED_TEST_SUITE_P(ForallIndexSetReduceSumMultipleTest);
 template <typename T>
 class ForallIndexSetReduceSumMultipleTest : public ::testing::Test
-{
-};
+{};
 
-TYPED_TEST_P(ForallIndexSetReduceSumMultipleTest, 
+TYPED_TEST_P(ForallIndexSetReduceSumMultipleTest,
              ReduceSumMultipleForallIndexSet)
 {
   using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
@@ -127,8 +121,8 @@ TYPED_TEST_P(ForallIndexSetReduceSumMultipleTest,
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<2>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  ForallIndexSetReduceSumMultipleTestImpl<IDX_TYPE, WORKING_RES,
-                                          EXEC_POLICY, REDUCE_POLICY>();
+  ForallIndexSetReduceSumMultipleTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY,
+                                          REDUCE_POLICY>();
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallIndexSetReduceSumMultipleTest,
diff --git a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMax.hpp b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMax.hpp
index bc5aec30d6..8e996e4a2c 100644
--- a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMax.hpp
+++ b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMax.hpp
@@ -14,32 +14,31 @@
 #include <numeric>
 #include <random>
 
-template <typename IDX_TYPE, 
-          typename DATA_TYPE, typename WORKING_RES, 
-          typename EXEC_POLICY, typename REDUCE_POLICY>
-void ForallReduceMaxMultipleTestImpl(IDX_TYPE first, 
-                                     IDX_TYPE last)
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void ForallReduceMaxMultipleTestImpl(IDX_TYPE first, IDX_TYPE last)
 {
   RAJA::TypedRangeSegment<IDX_TYPE> r1(first, last);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(last,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(last, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const DATA_TYPE default_val = static_cast<DATA_TYPE>(-SHRT_MAX);
-  const DATA_TYPE big_val = 500;
-  
+  const DATA_TYPE big_val     = 500;
+
   static std::random_device rd;
   static std::mt19937 mt(rd());
   static std::uniform_real_distribution<double> dist(-100, 100);
-  static std::uniform_int_distribution<int> dist2(static_cast<int>(first), static_cast<int>(last) - 1);
+  static std::uniform_int_distribution<int> dist2(static_cast<int>(first),
+                                                  static_cast<int>(last) - 1);
 
   // Workaround for broken omp-target reduction interface.
   // This should be `max0;` not `max0(0);`
@@ -49,7 +48,8 @@ void ForallReduceMaxMultipleTestImpl(IDX_TYPE first,
   RAJA::ReduceMax<REDUCE_POLICY, DATA_TYPE> max2(big_val);
 
   const int nOuterLoops = 2;
-  for (int l = 0; l < nOuterLoops; ++l) {
+  for (int l = 0; l < nOuterLoops; ++l)
+  {
 
     ASSERT_EQ(default_val, static_cast<DATA_TYPE>(max0.get()));
     ASSERT_EQ(default_val, static_cast<DATA_TYPE>(max1.get()));
@@ -58,61 +58,62 @@ void ForallReduceMaxMultipleTestImpl(IDX_TYPE first,
     DATA_TYPE current_max = default_val;
 
     const int nMiddleLoops = 2;
-    for (int k = 0; k < nMiddleLoops; ++k) {
+    for (int k = 0; k < nMiddleLoops; ++k)
+    {
 
-      for (IDX_TYPE i = 0; i < last; ++i) {
+      for (IDX_TYPE i = 0; i < last; ++i)
+      {
         test_array[i] = default_val;
       }
       working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * last);
 
       const int nloops = 6;
-      for (int j = 0; j < nloops; ++j) {
+      for (int j = 0; j < nloops; ++j)
+      {
 
-        DATA_TYPE roll = static_cast<DATA_TYPE>( dist(mt) );
+        DATA_TYPE roll     = static_cast<DATA_TYPE>(dist(mt));
         IDX_TYPE max_index = static_cast<IDX_TYPE>(dist2(mt));
 
         test_array[max_index] = roll;
-        working_res.memcpy(&working_array[max_index], &test_array[max_index], sizeof(DATA_TYPE));
+        working_res.memcpy(&working_array[max_index], &test_array[max_index],
+                           sizeof(DATA_TYPE));
 
-        if ( current_max < roll ) {
-          current_max = roll ;
+        if (current_max < roll)
+        {
+          current_max = roll;
         }
 
-        RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-          max0.max(working_array[idx]);
-          max1.max(2 * working_array[idx]);
-          max2.max(working_array[idx]);
-        });
+        RAJA::forall<EXEC_POLICY>(r1,
+                                  [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                                  {
+                                    max0.max(working_array[idx]);
+                                    max1.max(2 * working_array[idx]);
+                                    max2.max(working_array[idx]);
+                                  });
 
         ASSERT_EQ(current_max, static_cast<DATA_TYPE>(max0.get()));
         ASSERT_EQ(current_max * 2, static_cast<DATA_TYPE>(max1.get()));
         ASSERT_EQ(big_val, static_cast<DATA_TYPE>(max2.get()));
-
       }
-
     }
 
     max0.reset(default_val);
     max1.reset(default_val);
     max2.reset(big_val);
-
   }
 
   ASSERT_EQ(default_val, static_cast<DATA_TYPE>(max0.get()));
   ASSERT_EQ(default_val, static_cast<DATA_TYPE>(max1.get()));
   ASSERT_EQ(big_val, static_cast<DATA_TYPE>(max2.get()));
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMaxMultipleTest);
 template <typename T>
 class ForallReduceMaxMultipleTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMaxMultipleTest, ReduceMaxMultipleForall)
 {
@@ -122,8 +123,8 @@ TYPED_TEST_P(ForallReduceMaxMultipleTest, ReduceMaxMultipleForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallReduceMaxMultipleTestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES,
-                                  EXEC_POLICY, REDUCE_POLICY>(0, 2115);
+  ForallReduceMaxMultipleTestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                                  REDUCE_POLICY>(0, 2115);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceMaxMultipleTest,
diff --git a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMaxLoc.hpp b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMaxLoc.hpp
index 8f16762989..d13f7f05a3 100644
--- a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMaxLoc.hpp
+++ b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMaxLoc.hpp
@@ -14,40 +14,43 @@
 #include <numeric>
 #include <random>
 
-template <typename IDX_TYPE, 
-          typename DATA_TYPE, typename WORKING_RES, 
-          typename EXEC_POLICY, typename REDUCE_POLICY>
-void ForallReduceMaxLocMultipleTestImpl(IDX_TYPE first, 
-                                        IDX_TYPE last)
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void ForallReduceMaxLocMultipleTestImpl(IDX_TYPE first, IDX_TYPE last)
 {
   RAJA::TypedRangeSegment<IDX_TYPE> r1(first, last);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(last,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(last, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const DATA_TYPE default_val = static_cast<DATA_TYPE>(-SHRT_MAX);
-  const IDX_TYPE default_loc = -1;
-  const DATA_TYPE big_val = 500;
-  
+  const IDX_TYPE default_loc  = -1;
+  const DATA_TYPE big_val     = 500;
+
   static std::random_device rd;
   static std::mt19937 mt(rd());
   static std::uniform_real_distribution<double> dist(-100, 100);
-  static std::uniform_int_distribution<int> dist2(static_cast<int>(first), static_cast<int>(last) - 1);
+  static std::uniform_int_distribution<int> dist2(static_cast<int>(first),
+                                                  static_cast<int>(last) - 1);
 
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> max0(default_val, default_loc);
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> max1(default_val, default_loc);
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> max2(big_val, default_loc);
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> max0(default_val,
+                                                              default_loc);
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> max1(default_val,
+                                                              default_loc);
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> max2(big_val,
+                                                              default_loc);
 
   const int nOuterLoops = 2;
-  for (int l = 0; l < nOuterLoops; ++l) {
+  for (int l = 0; l < nOuterLoops; ++l)
+  {
 
     ASSERT_EQ(default_val, static_cast<DATA_TYPE>(max0.get()));
     ASSERT_EQ(default_loc, static_cast<IDX_TYPE>(max0.getLoc()));
@@ -59,37 +62,45 @@ void ForallReduceMaxLocMultipleTestImpl(IDX_TYPE first,
     ASSERT_EQ(default_loc, static_cast<IDX_TYPE>(max2.getLoc()));
 
     DATA_TYPE current_max = default_val;
-    IDX_TYPE  current_loc = default_loc;
+    IDX_TYPE current_loc  = default_loc;
 
     const int nMiddleLoops = 2;
-    for (int k = 0; k < nMiddleLoops; ++k) {
+    for (int k = 0; k < nMiddleLoops; ++k)
+    {
 
-      for (IDX_TYPE i = first; i < last; ++i) {
+      for (IDX_TYPE i = first; i < last; ++i)
+      {
         test_array[i] = default_val;
       }
       working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * last);
 
       const int nloops = 6;
-      for (int j = 0; j < nloops; ++j) {
+      for (int j = 0; j < nloops; ++j)
+      {
 
-        DATA_TYPE roll = static_cast<DATA_TYPE>( dist(mt) );
+        DATA_TYPE roll     = static_cast<DATA_TYPE>(dist(mt));
         IDX_TYPE max_index = static_cast<IDX_TYPE>(dist2(mt));
 
-        if ( current_max != roll ) { // avoid two indices getting the same value
+        if (current_max != roll)
+        {  // avoid two indices getting the same value
           test_array[max_index] = roll;
-          working_res.memcpy(&working_array[max_index], &test_array[max_index], sizeof(DATA_TYPE));
+          working_res.memcpy(&working_array[max_index], &test_array[max_index],
+                             sizeof(DATA_TYPE));
 
-          if ( current_max < roll ) {
+          if (current_max < roll)
+          {
             current_max = roll;
             current_loc = max_index;
           }
         }
 
-        RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-          max0.maxloc(working_array[idx], idx);
-          max1.maxloc(2 * working_array[idx], idx);
-          max2.maxloc(working_array[idx], idx);
-        });
+        RAJA::forall<EXEC_POLICY>(r1,
+                                  [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                                  {
+                                    max0.maxloc(working_array[idx], idx);
+                                    max1.maxloc(2 * working_array[idx], idx);
+                                    max2.maxloc(working_array[idx], idx);
+                                  });
 
         ASSERT_EQ(current_max, static_cast<DATA_TYPE>(max0.get()));
         ASSERT_EQ(current_loc, static_cast<IDX_TYPE>(max0.getLoc()));
@@ -99,15 +110,12 @@ void ForallReduceMaxLocMultipleTestImpl(IDX_TYPE first,
 
         ASSERT_EQ(big_val, static_cast<DATA_TYPE>(max2.get()));
         ASSERT_EQ(default_loc, static_cast<IDX_TYPE>(max2.getLoc()));
-
       }
-
     }
 
     max0.reset(default_val, default_loc);
     max1.reset(default_val, default_loc);
     max2.reset(big_val, default_loc);
-
   }
 
   ASSERT_EQ(default_val, static_cast<DATA_TYPE>(max0.get()));
@@ -119,17 +127,14 @@ void ForallReduceMaxLocMultipleTestImpl(IDX_TYPE first,
   ASSERT_EQ(big_val, static_cast<DATA_TYPE>(max2.get()));
   ASSERT_EQ(default_loc, static_cast<IDX_TYPE>(max2.getLoc()));
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMaxLocMultipleTest);
 template <typename T>
 class ForallReduceMaxLocMultipleTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMaxLocMultipleTest, ReduceMaxLocMultipleForall)
 {
diff --git a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMin.hpp b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMin.hpp
index 7e51ac2a2d..a33710f7dc 100644
--- a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMin.hpp
+++ b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMin.hpp
@@ -15,31 +15,30 @@
 #include <random>
 
 template <typename IDX_TYPE,
-          typename DATA_TYPE, typename WORKING_RES,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
-void ForallReduceMinMultipleTestImpl(IDX_TYPE first, 
-                                     IDX_TYPE last)
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void ForallReduceMinMultipleTestImpl(IDX_TYPE first, IDX_TYPE last)
 {
   RAJA::TypedRangeSegment<IDX_TYPE> r1(first, last);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(last,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(last, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const DATA_TYPE default_val = static_cast<DATA_TYPE>(SHRT_MAX);
-  const DATA_TYPE big_val = -500;
-  
+  const DATA_TYPE big_val     = -500;
+
   static std::random_device rd;
   static std::mt19937 mt(rd());
   static std::uniform_real_distribution<double> dist(-100, 100);
-  static std::uniform_int_distribution<int> dist2(static_cast<int>(first), static_cast<int>(last) - 1);
+  static std::uniform_int_distribution<int> dist2(static_cast<int>(first),
+                                                  static_cast<int>(last) - 1);
 
   // Workaround for broken omp-target reduction interface.
   // This should be `min0;` not `min0(0);`
@@ -49,7 +48,8 @@ void ForallReduceMinMultipleTestImpl(IDX_TYPE first,
   RAJA::ReduceMin<REDUCE_POLICY, DATA_TYPE> min2(big_val);
 
   const int nOuterLoops = 2;
-  for (int l = 0; l < nOuterLoops; ++l) {
+  for (int l = 0; l < nOuterLoops; ++l)
+  {
 
     ASSERT_EQ(default_val, static_cast<DATA_TYPE>(min0.get()));
     ASSERT_EQ(default_val, static_cast<DATA_TYPE>(min1.get()));
@@ -58,61 +58,62 @@ void ForallReduceMinMultipleTestImpl(IDX_TYPE first,
     DATA_TYPE current_min = default_val;
 
     const int nMiddleLoops = 2;
-    for (int k = 0; k < nMiddleLoops; ++k) {
+    for (int k = 0; k < nMiddleLoops; ++k)
+    {
 
-      for (IDX_TYPE i = 0; i < last; ++i) {
+      for (IDX_TYPE i = 0; i < last; ++i)
+      {
         test_array[i] = default_val;
       }
       working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * last);
 
       const int nloops = 6;
-      for (int j = 0; j < nloops; ++j) {
+      for (int j = 0; j < nloops; ++j)
+      {
 
-        DATA_TYPE roll = static_cast<DATA_TYPE>( dist(mt) );
+        DATA_TYPE roll     = static_cast<DATA_TYPE>(dist(mt));
         IDX_TYPE min_index = static_cast<IDX_TYPE>(dist2(mt));
 
         test_array[min_index] = roll;
-        working_res.memcpy(&working_array[min_index], &test_array[min_index], sizeof(DATA_TYPE));
+        working_res.memcpy(&working_array[min_index], &test_array[min_index],
+                           sizeof(DATA_TYPE));
 
-        if ( current_min > roll ) {
-          current_min = roll ;
+        if (current_min > roll)
+        {
+          current_min = roll;
         }
 
-        RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-          min0.min(working_array[idx]);
-          min1.min(2 * working_array[idx]);
-          min2.min(working_array[idx]);
-        });
+        RAJA::forall<EXEC_POLICY>(r1,
+                                  [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                                  {
+                                    min0.min(working_array[idx]);
+                                    min1.min(2 * working_array[idx]);
+                                    min2.min(working_array[idx]);
+                                  });
 
         ASSERT_EQ(current_min, static_cast<DATA_TYPE>(min0.get()));
         ASSERT_EQ(current_min * 2, static_cast<DATA_TYPE>(min1.get()));
         ASSERT_EQ(big_val, static_cast<DATA_TYPE>(min2.get()));
-
       }
-
     }
 
     min0.reset(default_val);
     min1.reset(default_val);
     min2.reset(big_val);
-
   }
 
   ASSERT_EQ(default_val, static_cast<DATA_TYPE>(min0.get()));
   ASSERT_EQ(default_val, static_cast<DATA_TYPE>(min1.get()));
   ASSERT_EQ(big_val, static_cast<DATA_TYPE>(min2.get()));
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMinMultipleTest);
 template <typename T>
 class ForallReduceMinMultipleTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMinMultipleTest, ReduceMinMultipleForall)
 {
@@ -122,8 +123,8 @@ TYPED_TEST_P(ForallReduceMinMultipleTest, ReduceMinMultipleForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallReduceMinMultipleTestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES, 
-                                  EXEC_POLICY, REDUCE_POLICY>(0, 2115);
+  ForallReduceMinMultipleTestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                                  REDUCE_POLICY>(0, 2115);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceMinMultipleTest,
diff --git a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMinLoc.hpp b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMinLoc.hpp
index d71f582ed9..c8e4431ac4 100644
--- a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMinLoc.hpp
+++ b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceMinLoc.hpp
@@ -15,40 +15,43 @@
 #include <random>
 
 template <typename IDX_TYPE,
-          typename DATA_TYPE, typename WORKING_RES,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
-void ForallReduceMinLocMultipleTestImpl(IDX_TYPE first, 
-                                        IDX_TYPE last)
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void ForallReduceMinLocMultipleTestImpl(IDX_TYPE first, IDX_TYPE last)
 {
   RAJA::TypedRangeSegment<IDX_TYPE> r1(first, last);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(last,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(last, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const DATA_TYPE default_val = static_cast<DATA_TYPE>(SHRT_MAX);
-  const IDX_TYPE default_loc = -1;
-  const DATA_TYPE big_val = -500;
+  const IDX_TYPE default_loc  = -1;
+  const DATA_TYPE big_val     = -500;
 
   static std::random_device rd;
   static std::mt19937 mt(rd());
   static std::uniform_real_distribution<double> dist(-100, 100);
-  static std::uniform_int_distribution<int> dist2(static_cast<int>(first), static_cast<int>(last) - 1);
+  static std::uniform_int_distribution<int> dist2(static_cast<int>(first),
+                                                  static_cast<int>(last) - 1);
 
   printf("min0 init { %f, %f }\n", (double)default_val, (double)default_loc);
-  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> min0(default_val, default_loc);
-  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> min1(default_val, default_loc);
-  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> min2(big_val, default_loc);
+  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> min0(default_val,
+                                                              default_loc);
+  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> min1(default_val,
+                                                              default_loc);
+  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, IDX_TYPE> min2(big_val,
+                                                              default_loc);
 
   const int nOuterLoops = 2;
-  for (int l = 0; l < nOuterLoops; ++l) {
+  for (int l = 0; l < nOuterLoops; ++l)
+  {
 
     printf("min0 { %f, %f }\n", (double)min0.get(), (double)min0.getLoc());
     ASSERT_EQ(default_val, static_cast<DATA_TYPE>(min0.get()));
@@ -61,40 +64,49 @@ void ForallReduceMinLocMultipleTestImpl(IDX_TYPE first,
     ASSERT_EQ(default_loc, static_cast<IDX_TYPE>(min2.getLoc()));
 
     DATA_TYPE current_min = default_val;
-    IDX_TYPE  current_loc = default_loc;
+    IDX_TYPE current_loc  = default_loc;
 
     const int nMiddleLoops = 2;
-    for (int k = 0; k < nMiddleLoops; ++k) {
+    for (int k = 0; k < nMiddleLoops; ++k)
+    {
 
       printf("reset data { %f }\n", (double)default_val);
-      for (IDX_TYPE i = first; i < last; ++i) {
+      for (IDX_TYPE i = first; i < last; ++i)
+      {
         test_array[i] = default_val;
       }
       working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * last);
 
       const int nloops = 6;
-      for (int j = 0; j < nloops; ++j) {
+      for (int j = 0; j < nloops; ++j)
+      {
 
-        DATA_TYPE roll = static_cast<DATA_TYPE>( dist(mt) );
+        DATA_TYPE roll     = static_cast<DATA_TYPE>(dist(mt));
         IDX_TYPE min_index = static_cast<IDX_TYPE>(dist2(mt));
 
         printf("rolling { %f, %f }\n", (double)roll, (double)min_index);
-        if ( current_min != roll ) { // avoid two indices getting the same value
+        if (current_min != roll)
+        {  // avoid two indices getting the same value
           test_array[min_index] = roll;
-          working_res.memcpy(&working_array[min_index], &test_array[min_index], sizeof(DATA_TYPE));
+          working_res.memcpy(&working_array[min_index], &test_array[min_index],
+                             sizeof(DATA_TYPE));
 
-          if ( current_min > roll ) {
+          if (current_min > roll)
+          {
             current_min = roll;
             current_loc = min_index;
           }
         }
-        printf("current { %f, %f }\n", (double)current_min, (double)current_loc);
+        printf("current { %f, %f }\n", (double)current_min,
+               (double)current_loc);
 
-        RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-          min0.minloc(working_array[idx], idx);
-          min1.minloc(2 * working_array[idx], idx);
-          min2.minloc(working_array[idx], idx);
-        });
+        RAJA::forall<EXEC_POLICY>(r1,
+                                  [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                                  {
+                                    min0.minloc(working_array[idx], idx);
+                                    min1.minloc(2 * working_array[idx], idx);
+                                    min2.minloc(working_array[idx], idx);
+                                  });
 
         printf("min0 { %f, %f }\n", (double)min0.get(), (double)min0.getLoc());
         ASSERT_EQ(current_min, static_cast<DATA_TYPE>(min0.get()));
@@ -105,16 +117,13 @@ void ForallReduceMinLocMultipleTestImpl(IDX_TYPE first,
 
         ASSERT_EQ(big_val, static_cast<DATA_TYPE>(min2.get()));
         ASSERT_EQ(default_loc, static_cast<IDX_TYPE>(min2.getLoc()));
-
       }
-
     }
 
     printf("min0 reset { %f, %f }\n", (double)default_val, (double)default_loc);
     min0.reset(default_val, (DATA_TYPE)default_loc);
     min1.reset(default_val, default_loc);
     min2.reset(big_val, default_loc);
-
   }
 
   printf("min0 { %f, %f }\n", (double)min0.get(), (double)min0.getLoc());
@@ -127,17 +136,14 @@ void ForallReduceMinLocMultipleTestImpl(IDX_TYPE first,
   ASSERT_EQ(big_val, static_cast<DATA_TYPE>(min2.get()));
   ASSERT_EQ(default_loc, static_cast<IDX_TYPE>(min2.getLoc()));
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceMinLocMultipleTest);
 template <typename T>
 class ForallReduceMinLocMultipleTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceMinLocMultipleTest, ReduceMinLocMultipleForall)
 {
@@ -147,7 +153,7 @@ TYPED_TEST_P(ForallReduceMinLocMultipleTest, ReduceMinLocMultipleForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallReduceMinLocMultipleTestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES, 
+  ForallReduceMinLocMultipleTestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES,
                                      EXEC_POLICY, REDUCE_POLICY>(0, 2115);
 }
 
diff --git a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceSum.hpp b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceSum.hpp
index b5a6c469d1..aa489187f0 100644
--- a/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceSum.hpp
+++ b/test/functional/forall/reduce-multiple-segment/tests/test-forall-segment-multiple-ReduceSum.hpp
@@ -12,27 +12,26 @@
 #include <numeric>
 
 template <typename IDX_TYPE,
-          typename DATA_TYPE, typename WORKING_RES,
-          typename EXEC_POLICY, typename REDUCE_POLICY>
-void ForallReduceSumMultipleStaggeredTestImpl(IDX_TYPE first, 
-                                              IDX_TYPE last)
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void ForallReduceSumMultipleStaggeredTestImpl(IDX_TYPE first, IDX_TYPE last)
 {
   RAJA::TypedRangeSegment<IDX_TYPE> r1(first, last);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(last,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(last, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const DATA_TYPE initval = 2;
 
-  for (IDX_TYPE i = first; i < last; ++i) {
+  for (IDX_TYPE i = first; i < last; ++i)
+  {
     test_array[i] = initval;
   }
 
@@ -51,60 +50,63 @@ void ForallReduceSumMultipleStaggeredTestImpl(IDX_TYPE first,
   const DATA_TYPE index_len = static_cast<DATA_TYPE>(last - first);
 
   const int nloops = 2;
-  for (int j = 0; j < nloops; ++j) {
-
-    RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-      sum0 += working_array[idx];
-      sum1 += working_array[idx] * 2;
-      sum2 += working_array[idx] * 3;
-      sum3 += working_array[idx] * 4;
-      sum4 += working_array[idx] * 5;
-      sum5 += working_array[idx] * 6;
-      sum6 += working_array[idx] * 7;
-      sum7 += working_array[idx] * 8;
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+
+    RAJA::forall<EXEC_POLICY>(r1,
+                              [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                              {
+                                sum0 += working_array[idx];
+                                sum1 += working_array[idx] * 2;
+                                sum2 += working_array[idx] * 3;
+                                sum3 += working_array[idx] * 4;
+                                sum4 += working_array[idx] * 5;
+                                sum5 += working_array[idx] * 6;
+                                sum6 += working_array[idx] * 7;
+                                sum7 += working_array[idx] * 8;
+                              });
 
     DATA_TYPE check_val = initval * index_len * (j + 1);
 
     ASSERT_EQ(1 * check_val, static_cast<DATA_TYPE>(sum0.get()));
-    ASSERT_EQ(2 * check_val + (initval*1), static_cast<DATA_TYPE>(sum1.get()));
+    ASSERT_EQ(2 * check_val + (initval * 1),
+              static_cast<DATA_TYPE>(sum1.get()));
     ASSERT_EQ(3 * check_val, static_cast<DATA_TYPE>(sum2.get()));
-    ASSERT_EQ(4 * check_val + (initval*3), static_cast<DATA_TYPE>(sum3.get()));
+    ASSERT_EQ(4 * check_val + (initval * 3),
+              static_cast<DATA_TYPE>(sum3.get()));
     ASSERT_EQ(5 * check_val, static_cast<DATA_TYPE>(sum4.get()));
-    ASSERT_EQ(6 * check_val + (initval*5), static_cast<DATA_TYPE>(sum5.get()));
+    ASSERT_EQ(6 * check_val + (initval * 5),
+              static_cast<DATA_TYPE>(sum5.get()));
     ASSERT_EQ(7 * check_val, static_cast<DATA_TYPE>(sum6.get()));
-    ASSERT_EQ(8 * check_val + (initval*7), static_cast<DATA_TYPE>(sum7.get()));
-
+    ASSERT_EQ(8 * check_val + (initval * 7),
+              static_cast<DATA_TYPE>(sum7.get()));
   }
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 template <typename IDX_TYPE,
-          typename DATA_TYPE, typename WORKING_RES, 
-          typename EXEC_POLICY, typename REDUCE_POLICY>
-void ForallReduceSumMultipleStaggered2TestImpl(IDX_TYPE first, 
-			                       IDX_TYPE last)
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void ForallReduceSumMultipleStaggered2TestImpl(IDX_TYPE first, IDX_TYPE last)
 {
   RAJA::TypedRangeSegment<IDX_TYPE> r1(first, last);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(last,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(last, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const DATA_TYPE initval = 2;
 
-  for (IDX_TYPE i = first; i < last; ++i) {
+  for (IDX_TYPE i = first; i < last; ++i)
+  {
     test_array[i] = initval;
   }
 
@@ -134,43 +136,46 @@ void ForallReduceSumMultipleStaggered2TestImpl(IDX_TYPE first,
   sum7.reset(initval * 7);
 
   const int nloops = 3;
-  for (int j = 0; j < nloops; ++j) {
-
-    RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-      sum0 += working_array[idx];
-      sum1 += working_array[idx] * 2;
-      sum2 += working_array[idx] * 3;
-      sum3 += working_array[idx] * 4;
-      sum4 += working_array[idx] * 5;
-      sum5 += working_array[idx] * 6;
-      sum6 += working_array[idx] * 7;
-      sum7 += working_array[idx] * 8;
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+
+    RAJA::forall<EXEC_POLICY>(r1,
+                              [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                              {
+                                sum0 += working_array[idx];
+                                sum1 += working_array[idx] * 2;
+                                sum2 += working_array[idx] * 3;
+                                sum3 += working_array[idx] * 4;
+                                sum4 += working_array[idx] * 5;
+                                sum5 += working_array[idx] * 6;
+                                sum6 += working_array[idx] * 7;
+                                sum7 += working_array[idx] * 8;
+                              });
 
     DATA_TYPE check_val = initval * index_len * (j + 1);
 
     ASSERT_EQ(1 * check_val, static_cast<DATA_TYPE>(sum0.get()));
-    ASSERT_EQ(2 * check_val + (initval*1), static_cast<DATA_TYPE>(sum1.get()));
+    ASSERT_EQ(2 * check_val + (initval * 1),
+              static_cast<DATA_TYPE>(sum1.get()));
     ASSERT_EQ(3 * check_val, static_cast<DATA_TYPE>(sum2.get()));
-    ASSERT_EQ(4 * check_val + (initval*3), static_cast<DATA_TYPE>(sum3.get()));
+    ASSERT_EQ(4 * check_val + (initval * 3),
+              static_cast<DATA_TYPE>(sum3.get()));
     ASSERT_EQ(5 * check_val, static_cast<DATA_TYPE>(sum4.get()));
-    ASSERT_EQ(6 * check_val + (initval*5), static_cast<DATA_TYPE>(sum5.get()));
+    ASSERT_EQ(6 * check_val + (initval * 5),
+              static_cast<DATA_TYPE>(sum5.get()));
     ASSERT_EQ(7 * check_val, static_cast<DATA_TYPE>(sum6.get()));
-    ASSERT_EQ(8 * check_val + (initval*7), static_cast<DATA_TYPE>(sum7.get()));
-
+    ASSERT_EQ(8 * check_val + (initval * 7),
+              static_cast<DATA_TYPE>(sum7.get()));
   }
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallReduceSumMultipleTest);
 template <typename T>
 class ForallReduceSumMultipleTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallReduceSumMultipleTest, ReduceSumMultipleForall)
 {
@@ -180,11 +185,12 @@ TYPED_TEST_P(ForallReduceSumMultipleTest, ReduceSumMultipleForall)
   using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  ForallReduceSumMultipleStaggeredTestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES, 
+  ForallReduceSumMultipleStaggeredTestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES,
                                            EXEC_POLICY, REDUCE_POLICY>(0, 2115);
 
-  ForallReduceSumMultipleStaggered2TestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES, 
-                                            EXEC_POLICY, REDUCE_POLICY>(0, 2115);
+  ForallReduceSumMultipleStaggered2TestImpl<IDX_TYPE, DATA_TYPE, WORKING_RES,
+                                            EXEC_POLICY, REDUCE_POLICY>(0,
+                                                                        2115);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallReduceSumMultipleTest,
diff --git a/test/functional/forall/region/tests/test-forall-region.hpp b/test/functional/forall/region/tests/test-forall-region.hpp
index f83d9ef1a5..6b0dafd652 100644
--- a/test/functional/forall/region/tests/test-forall-region.hpp
+++ b/test/functional/forall/region/tests/test-forall-region.hpp
@@ -11,59 +11,54 @@
 #include <numeric>
 #include <vector>
 
-template <typename INDEX_TYPE, typename WORKING_RES, 
-          typename REG_POLICY, typename EXEC_POLICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename REG_POLICY,
+          typename EXEC_POLICY>
 void ForallRegionTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 {
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   //
   // Set some local variables and create some segments for using in tests
   //
   const INDEX_TYPE N = last - first;
-  
+
   RAJA::TypedRangeSegment<INDEX_TYPE> rseg(first, last);
 
   std::vector<INDEX_TYPE> idx_array(N);
   std::iota(&idx_array[0], &idx_array[0] + N, first);
 
-  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], N,
-                                          working_res);
+  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], N, working_res);
 
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
-
-  working_res.memset( working_array, 0, sizeof(INDEX_TYPE) * N );
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  RAJA::region<REG_POLICY>([=]() {
+  working_res.memset(working_array, 0, sizeof(INDEX_TYPE) * N);
 
-    RAJA::forall<EXEC_POLICY>(rseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      working_array[idx - first] += 1;
-    });
+  RAJA::region<REG_POLICY>(
+      [=]()
+      {
+        RAJA::forall<EXEC_POLICY>(rseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                                  { working_array[idx - first] += 1; });
 
-    RAJA::forall<EXEC_POLICY>(lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      working_array[idx - first] += 2; 
-    });
-
-  });
+        RAJA::forall<EXEC_POLICY>(lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                                  { working_array[idx - first] += 2; });
+      });
 
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(check_array[i], 3);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -71,8 +66,7 @@ void ForallRegionTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 TYPED_TEST_SUITE_P(ForallRegionTest);
 template <typename T>
 class ForallRegionTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallRegionTest, RegionForall)
 {
@@ -82,11 +76,12 @@ TYPED_TEST_P(ForallRegionTest, RegionForall)
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
   ForallRegionTestImpl<INDEX_TYPE, WORKING_RES, REG_POLICY, EXEC_POLICY>(0, 25);
-  ForallRegionTestImpl<INDEX_TYPE, WORKING_RES, REG_POLICY, EXEC_POLICY>(1, 153);
-  ForallRegionTestImpl<INDEX_TYPE, WORKING_RES, REG_POLICY, EXEC_POLICY>(3, 2556);
+  ForallRegionTestImpl<INDEX_TYPE, WORKING_RES, REG_POLICY, EXEC_POLICY>(1,
+                                                                         153);
+  ForallRegionTestImpl<INDEX_TYPE, WORKING_RES, REG_POLICY, EXEC_POLICY>(3,
+                                                                         2556);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallRegionTest,
-                            RegionForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallRegionTest, RegionForall);
 
 #endif  // __TEST_FORALL_REGION_HPP__
diff --git a/test/functional/forall/resource-indexset/tests/test-forall-ResourceIcountIndexSet.hpp b/test/functional/forall/resource-indexset/tests/test-forall-ResourceIcountIndexSet.hpp
index b000b270da..f909ef9d4c 100644
--- a/test/functional/forall/resource-indexset/tests/test-forall-ResourceIcountIndexSet.hpp
+++ b/test/functional/forall/resource-indexset/tests/test-forall-ResourceIcountIndexSet.hpp
@@ -21,67 +21,63 @@ void ForallResourceIcountIndexSetTestImpl()
   using RangeStrideSegType = RAJA::TypedRangeStrideSegment<INDEX_TYPE>;
   using ListSegType        = RAJA::TypedListSegment<INDEX_TYPE>;
 
-  using IndexSetType = 
-   RAJA::TypedIndexSet< RangeSegType, RangeStrideSegType, ListSegType >; 
+  using IndexSetType =
+      RAJA::TypedIndexSet<RangeSegType, RangeStrideSegType, ListSegType>;
 
   WORKING_RES working_res;
-  camp::resources::Resource erased_working_res{working_res};
+  camp::resources::Resource erased_working_res {working_res};
 
-  IndexSetType iset; 
-  std::vector<INDEX_TYPE> is_indices; 
+  IndexSetType iset;
+  std::vector<INDEX_TYPE> is_indices;
   buildIndexSet<INDEX_TYPE, RangeSegType, RangeStrideSegType, ListSegType>(
-    iset, is_indices, erased_working_res);
+      iset, is_indices, erased_working_res);
 
   //
   // Working array length
   //
-  const INDEX_TYPE N = is_indices[ is_indices.size() - 1 ] + 1;
+  const INDEX_TYPE N = is_indices[is_indices.size() - 1] + 1;
 
   //
   // Allocate and initialize arrays used in testing
-  //  
+  //
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     erased_working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, erased_working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );  
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
   INDEX_TYPE ticount = 0;
-  for (size_t i = 0; i < is_indices.size(); ++i) {
-    test_array[ ticount++ ] = is_indices[i];
+  for (size_t i = 0; i < is_indices.size(); ++i)
+  {
+    test_array[ticount++] = is_indices[i];
   }
 
-  RAJA::forall_Icount<EXEC_POLICY>(working_res, iset,
-    [=] RAJA_HOST_DEVICE(INDEX_TYPE icount, INDEX_TYPE idx) {
-    working_array[icount] = idx;
-  });
+  RAJA::forall_Icount<EXEC_POLICY>(
+      working_res, iset,
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE icount, INDEX_TYPE idx)
+      { working_array[icount] = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(erased_working_res,
-                                       working_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<INDEX_TYPE>(erased_working_res, working_array,
+                                       check_array, test_array);
 }
 
 
 TYPED_TEST_SUITE_P(ForallResourceIcountIndexSetTest);
 template <typename T>
 class ForallResourceIcountIndexSetTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallResourceIcountIndexSetTest, ResourceIndexSetForallIcount)
 {
@@ -89,7 +85,8 @@ TYPED_TEST_P(ForallResourceIcountIndexSetTest, ResourceIndexSetForallIcount)
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY      = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  ForallResourceIcountIndexSetTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>();
+  ForallResourceIcountIndexSetTestImpl<INDEX_TYPE, WORKING_RESOURCE,
+                                       EXEC_POLICY>();
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallResourceIcountIndexSetTest,
diff --git a/test/functional/forall/resource-indexset/tests/test-forall-ResourceIndexSet.hpp b/test/functional/forall/resource-indexset/tests/test-forall-ResourceIndexSet.hpp
index c1f714013d..2129d1350f 100644
--- a/test/functional/forall/resource-indexset/tests/test-forall-ResourceIndexSet.hpp
+++ b/test/functional/forall/resource-indexset/tests/test-forall-ResourceIndexSet.hpp
@@ -20,66 +20,62 @@ void ForallResourceIndexSetTestImpl()
   using RangeStrideSegType = RAJA::TypedRangeStrideSegment<INDEX_TYPE>;
   using ListSegType        = RAJA::TypedListSegment<INDEX_TYPE>;
 
-  using IndexSetType = 
-   RAJA::TypedIndexSet< RangeSegType, RangeStrideSegType, ListSegType >; 
+  using IndexSetType =
+      RAJA::TypedIndexSet<RangeSegType, RangeStrideSegType, ListSegType>;
 
   WORKING_RES working_res;
-  camp::resources::Resource erased_working_res{working_res};
+  camp::resources::Resource erased_working_res {working_res};
 
-  IndexSetType iset; 
-  std::vector<INDEX_TYPE> is_indices; 
+  IndexSetType iset;
+  std::vector<INDEX_TYPE> is_indices;
   buildIndexSet<INDEX_TYPE, RangeSegType, RangeStrideSegType, ListSegType>(
-    iset, is_indices, erased_working_res);
+      iset, is_indices, erased_working_res);
 
   //
   // Working array length
   //
-  const INDEX_TYPE N = is_indices[ is_indices.size() - 1 ] + 1;
+  const INDEX_TYPE N = is_indices[is_indices.size() - 1] + 1;
 
   //
   // Allocate and initialize arrays used in testing
-  //  
+  //
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     erased_working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, erased_working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );  
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
-  for (size_t i = 0; i < is_indices.size(); ++i) {
-    test_array[ is_indices[i] ] = is_indices[i];
+  for (size_t i = 0; i < is_indices.size(); ++i)
+  {
+    test_array[is_indices[i]] = is_indices[i];
   }
 
-  RAJA::forall<EXEC_POLICY>(working_res, iset, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    working_array[idx] = idx;
-  });
+  RAJA::forall<EXEC_POLICY>(working_res, iset,
+                            [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            { working_array[idx] = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  // 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  //
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(erased_working_res,
-                                       working_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<INDEX_TYPE>(erased_working_res, working_array,
+                                       check_array, test_array);
 }
 
 
 TYPED_TEST_SUITE_P(ForallResourceIndexSetTest);
 template <typename T>
 class ForallResourceIndexSetTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallResourceIndexSetTest, ResourceIndexSetForall)
 {
@@ -90,7 +86,6 @@ TYPED_TEST_P(ForallResourceIndexSetTest, ResourceIndexSetForall)
   ForallResourceIndexSetTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallResourceIndexSetTest,
-                            ResourceIndexSetForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallResourceIndexSetTest, ResourceIndexSetForall);
 
 #endif  // __TEST_FORALL_INDEXSET_HPP__
diff --git a/test/functional/forall/resource-segment/tests/test-forall-resource-ListSegment.hpp b/test/functional/forall/resource-segment/tests/test-forall-resource-ListSegment.hpp
index 5e0675cc98..b5596ab1dd 100644
--- a/test/functional/forall/resource-segment/tests/test-forall-resource-ListSegment.hpp
+++ b/test/functional/forall/resource-segment/tests/test-forall-resource-ListSegment.hpp
@@ -22,67 +22,70 @@ void ForallResourceListSegmentTestImpl(INDEX_TYPE N)
   // Create and initialize indices in idx_array used to create list segment
   std::vector<INDEX_TYPE> idx_array;
 
-  srand ( time(NULL) );
+  srand(time(NULL));
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i) {
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i)
+  {
     INDEX_TYPE randval = INDEX_TYPE(rand() % RAJA::stripIndexType(N));
-    if ( i < randval ) {
+    if (i < randval)
+    {
       idx_array.push_back(i);
-    }     
+    }
   }
 
   size_t idxlen = idx_array.size();
 
   WORKING_RES working_res;
-  camp::resources::Resource erased_working_res{working_res};
+  camp::resources::Resource erased_working_res {working_res};
 
   // Create list segment for tests
-  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], idxlen, 
+  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], idxlen,
                                           erased_working_res);
 
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     erased_working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, erased_working_res, &working_array,
+                                     &check_array, &test_array);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
     test_array[RAJA::stripIndexType(i)] = INDEX_TYPE(0);
   }
 
-  working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
+  working_res.memcpy(working_array, test_array,
+                     sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
 
-  for (size_t i = 0; i < idxlen; ++i) {
-    test_array[ RAJA::stripIndexType(idx_array[i]) ] = idx_array[i];
+  for (size_t i = 0; i < idxlen; ++i)
+  {
+    test_array[RAJA::stripIndexType(idx_array[i])] = idx_array[i];
   }
 
-  RAJA::forall<EXEC_POLICY>(working_res, lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    working_array[RAJA::stripIndexType(idx)] = idx;
-  }); 
+  RAJA::forall<EXEC_POLICY>(working_res, lseg,
+                            [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
+                              working_array[RAJA::stripIndexType(idx)] = idx;
+                            });
 
-  working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
+  working_res.memcpy(check_array, working_array,
+                     sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
 
-  // 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  //
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(erased_working_res,
-                                       working_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<INDEX_TYPE>(erased_working_res, working_array,
+                                       check_array, test_array);
 }
 
 
 TYPED_TEST_SUITE_P(ForallResourceListSegmentTest);
 template <typename T>
 class ForallResourceListSegmentTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallResourceListSegmentTest, ResourceListSegmentForall)
 {
@@ -90,11 +93,14 @@ TYPED_TEST_P(ForallResourceListSegmentTest, ResourceListSegmentForall)
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY      = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  ForallResourceListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(INDEX_TYPE(13));
+  ForallResourceListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      INDEX_TYPE(13));
 
-  ForallResourceListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(INDEX_TYPE(2047));
+  ForallResourceListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      INDEX_TYPE(2047));
 
-  ForallResourceListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(INDEX_TYPE(32000));
+  ForallResourceListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      INDEX_TYPE(32000));
 }
 
 REGISTER_TYPED_TEST_SUITE_P(ForallResourceListSegmentTest,
diff --git a/test/functional/forall/resource-segment/tests/test-forall-resource-RangeSegment.hpp b/test/functional/forall/resource-segment/tests/test-forall-resource-RangeSegment.hpp
index 83cc7c4aa1..321a0804fa 100644
--- a/test/functional/forall/resource-segment/tests/test-forall-resource-RangeSegment.hpp
+++ b/test/functional/forall/resource-segment/tests/test-forall-resource-RangeSegment.hpp
@@ -13,60 +13,66 @@
 template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void ForallResourceRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 {
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first), RAJA::stripIndexType(last));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first),
+                                         RAJA::stripIndexType(last));
   INDEX_TYPE N = INDEX_TYPE(r1.end() - r1.begin());
 
   WORKING_RES working_res;
-  camp::resources::Resource erased_working_res{working_res};
+  camp::resources::Resource erased_working_res {working_res};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     erased_working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, erased_working_res, &working_array,
+                                     &check_array, &test_array);
 
   const INDEX_TYPE rbegin = *r1.begin();
 
   std::iota(test_array, test_array + RAJA::stripIndexType(N), rbegin);
 
-  RAJA::forall<EXEC_POLICY>(working_res, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    working_array[RAJA::stripIndexType(idx - rbegin)] = idx;
-  });
+  RAJA::forall<EXEC_POLICY>(
+      working_res, r1,
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+      { working_array[RAJA::stripIndexType(idx - rbegin)] = idx; });
 
-  working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
+  working_res.memcpy(check_array, working_array,
+                     sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(erased_working_res,
-                                       working_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<INDEX_TYPE>(erased_working_res, working_array,
+                                       check_array, test_array);
 }
 
 
 TYPED_TEST_SUITE_P(ForallResourceRangeSegmentTest);
 template <typename T>
 class ForallResourceRangeSegmentTest : public ::testing::Test
-{
-};
+{};
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
-{
-}
+{}
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
 {
-  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0));
-  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5));
+  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(0));
+  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(5));
 }
 
 
@@ -76,9 +82,12 @@ TYPED_TEST_P(ForallResourceRangeSegmentTest, ResourceRangeSegmentForall)
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(27));
-  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(2047));
-  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(32000));
+  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(27));
+  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(2047));
+  ForallResourceRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(32000));
 
   runNegativeTests<INDEX_TYPE, WORKING_RES, EXEC_POLICY>();
 }
diff --git a/test/functional/forall/resource-segment/tests/test-forall-resource-RangeStrideSegment.hpp b/test/functional/forall/resource-segment/tests/test-forall-resource-RangeStrideSegment.hpp
index f85f295548..37d3ebfbf3 100644
--- a/test/functional/forall/resource-segment/tests/test-forall-resource-RangeStrideSegment.hpp
+++ b/test/functional/forall/resource-segment/tests/test-forall-resource-RangeStrideSegment.hpp
@@ -8,100 +8,142 @@
 #ifndef __TEST_FORALL_RESOURCE_RANGESTRIDESEGMENT_HPP__
 #define __TEST_FORALL_RESOURCE_RANGESTRIDESEGMENT_HPP__
 
-template <typename INDEX_TYPE, typename DIFF_TYPE, 
-          typename WORKING_RES, typename EXEC_POLICY>
-void ForallResourceRangeStrideSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last, 
-                                      DIFF_TYPE stride)
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY>
+void ForallResourceRangeStrideSegmentTestImpl(INDEX_TYPE first,
+                                              INDEX_TYPE last,
+                                              DIFF_TYPE stride)
 {
-  RAJA::TypedRangeStrideSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first), RAJA::stripIndexType(last), stride);
+  RAJA::TypedRangeStrideSegment<INDEX_TYPE> r1(
+      RAJA::stripIndexType(first), RAJA::stripIndexType(last), stride);
   INDEX_TYPE N = INDEX_TYPE(r1.size());
 
   WORKING_RES working_res;
-  camp::resources::Resource erased_working_res{working_res};
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource erased_working_res {working_res};
+  camp::resources::Resource host_res {camp::resources::Host()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     erased_working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, erased_working_res, &working_array,
+                                     &check_array, &test_array);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
     test_array[RAJA::stripIndexType(i)] = INDEX_TYPE(0);
   }
 
-  working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * RAJA::stripIndexType(N)); 
+  working_res.memcpy(working_array, test_array,
+                     sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
 
   INDEX_TYPE idx = first;
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i) {
-    test_array[ RAJA::stripIndexType((idx-first)/stride) ] = idx;
-    idx += stride; 
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i)
+  {
+    test_array[RAJA::stripIndexType((idx - first) / stride)] = idx;
+    idx += stride;
   }
 
-  RAJA::forall<EXEC_POLICY>(working_res, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    working_array[ RAJA::stripIndexType((idx-first)/stride) ] = idx;
-  });
+  RAJA::forall<EXEC_POLICY>(
+      working_res, r1,
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+      { working_array[RAJA::stripIndexType((idx - first) / stride)] = idx; });
 
-  working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
+  working_res.memcpy(check_array, working_array,
+                     sizeof(INDEX_TYPE) * RAJA::stripIndexType(N));
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(erased_working_res,
-                                       working_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<INDEX_TYPE>(erased_working_res, working_array,
+                                       check_array, test_array);
 }
 
 
 TYPED_TEST_SUITE_P(ForallResourceRangeStrideSegmentTest);
 template <typename T>
 class ForallResourceRangeStrideSegmentTest : public ::testing::Test
-{
-};
-
-template <typename INDEX_TYPE, typename DIFF_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+{};
+
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeStrideTests()
-{
-}
-
-template <typename INDEX_TYPE, typename DIFF_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+{}
+
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeStrideTests()
 {
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-10), INDEX_TYPE(-1), DIFF_TYPE(2));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0), DIFF_TYPE(2));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5), DIFF_TYPE(3));
-
-// Test negative strides
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(10), INDEX_TYPE(-1), DIFF_TYPE(-1));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(10), INDEX_TYPE(0), DIFF_TYPE(-2));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(-10), INDEX_TYPE(-1), DIFF_TYPE(2));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(0), DIFF_TYPE(2));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(5), DIFF_TYPE(3));
+
+  // Test negative strides
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(10), INDEX_TYPE(-1), DIFF_TYPE(-1));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(10), INDEX_TYPE(0), DIFF_TYPE(-2));
 }
 
 
-TYPED_TEST_P(ForallResourceRangeStrideSegmentTest, ResourceRangeStrideSegmentForall)
+TYPED_TEST_P(ForallResourceRangeStrideSegmentTest,
+             ResourceRangeStrideSegmentForall)
 {
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
-  using DIFF_TYPE   = typename std::make_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::type;
-
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(1));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(1));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(2));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(2));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(21), DIFF_TYPE(2));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(21), DIFF_TYPE(2));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(255), DIFF_TYPE(2));
-
-// Test size zero segments
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(-2));
-  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(-2));
+  using DIFF_TYPE =
+      typename std::make_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::type;
+
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(1));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(1));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(2));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(2));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(21), DIFF_TYPE(2));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(21), DIFF_TYPE(2));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(255), DIFF_TYPE(2));
+
+  // Test size zero segments
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(-2));
+  ForallResourceRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                           EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(-2));
 
   runNegativeStrideTests<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>();
 }
diff --git a/test/functional/forall/segment-view/tests/test-forall-ListSegmentView.hpp b/test/functional/forall/segment-view/tests/test-forall-ListSegmentView.hpp
index e673abf306..93c2e1c07d 100644
--- a/test/functional/forall/segment-view/tests/test-forall-ListSegmentView.hpp
+++ b/test/functional/forall/segment-view/tests/test-forall-ListSegmentView.hpp
@@ -23,49 +23,49 @@ void ForallListSegmentViewTestImpl(INDEX_TYPE N)
   // Create and initialize indices in idx_array used to create list segment
   std::vector<INDEX_TYPE> idx_array;
 
-  srand ( time(NULL) );
+  srand(time(NULL));
 
-  for (INDEX_TYPE i = 0; i < N; ++i) {
+  for (INDEX_TYPE i = 0; i < N; ++i)
+  {
     INDEX_TYPE randval = rand() % N;
-    if ( i < randval ) {
+    if (i < randval)
+    {
       idx_array.push_back(i);
-    }     
+    }
   }
 
   size_t idxlen = idx_array.size();
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
-  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], idxlen, 
-                                          working_res);
+  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], idxlen, working_res);
 
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );  
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
-  for (size_t i = 0; i < idxlen; ++i) {
-    test_array[ idx_array[i] ] = idx_array[i];
+  for (size_t i = 0; i < idxlen; ++i)
+  {
+    test_array[idx_array[i]] = idx_array[i];
   }
 
   using layout_type = RAJA::Layout<1, INDEX_TYPE, 0>;
-  using view_type = RAJA::View< INDEX_TYPE, layout_type >;
-#if (!(defined(_GLIBCXX_RELEASE) || defined(RAJA_COMPILER_INTEL) || defined(RAJA_COMPILER_MSVC)))\
-    || _GLIBCXX_RELEASE >= 20150716
-  #if (__GNUG__ && __GNUC__ < 5)
-  #define IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T)
-  #else
-  #define IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable<T>::value
-  #endif
+  using view_type   = RAJA::View<INDEX_TYPE, layout_type>;
+#if (!(defined(_GLIBCXX_RELEASE) || defined(RAJA_COMPILER_INTEL) ||            \
+       defined(RAJA_COMPILER_MSVC))) ||                                        \
+    _GLIBCXX_RELEASE >= 20150716
+#if (__GNUG__ && __GNUC__ < 5)
+#define IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T)
+#else
+#define IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable<T>::value
+#endif
   static_assert(IS_TRIVIALLY_COPYABLE(layout_type),
                 "These layouts should always be triviallly copyable");
 
@@ -77,23 +77,21 @@ void ForallListSegmentViewTestImpl(INDEX_TYPE N)
 
 
 #endif
-  
+
   RAJA::Layout<1> layout(N);
   view_type work_view(working_array, layout);
 
-  RAJA::forall<EXEC_POLICY>(lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    work_view( idx ) = idx;
-  });
+  RAJA::forall<EXEC_POLICY>(lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            { work_view(idx) = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -104,69 +102,64 @@ void ForallListSegmentOffsetViewTestImpl(INDEX_TYPE N, INDEX_TYPE offset)
   // Create and initialize indices in idx_array used to create list segment
   std::vector<INDEX_TYPE> idx_array;
 
-  srand ( time(NULL) );
+  srand(time(NULL));
 
-  for (INDEX_TYPE i = 0; i < N; ++i) {
+  for (INDEX_TYPE i = 0; i < N; ++i)
+  {
     INDEX_TYPE randval = rand() % N;
-    if ( i < randval ) {
-      idx_array.push_back(i+offset);
-    }     
+    if (i < randval)
+    {
+      idx_array.push_back(i + offset);
+    }
   }
 
   size_t idxlen = idx_array.size();
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
-  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], idxlen, 
-                                          working_res);
+  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], idxlen, working_res);
 
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );  
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
-  for (size_t i = 0; i < idxlen; ++i) {
-    test_array[ idx_array[i]-offset ] = idx_array[i];
+  for (size_t i = 0; i < idxlen; ++i)
+  {
+    test_array[idx_array[i] - offset] = idx_array[i];
   }
 
   using layout_type = RAJA::OffsetLayout<1, INDEX_TYPE>;
-  using view_type = RAJA::View< INDEX_TYPE, layout_type >;
+  using view_type   = RAJA::View<INDEX_TYPE, layout_type>;
 
   INDEX_TYPE N_offset = N + offset;
-  view_type work_view(working_array, 
-                      RAJA::make_offset_layout<1, INDEX_TYPE>( {{offset}}, 
-                                                               {{N_offset}} ));
+  view_type work_view(working_array, RAJA::make_offset_layout<1, INDEX_TYPE>(
+                                         {{offset}}, {{N_offset}}));
 
-  RAJA::forall<EXEC_POLICY>(lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    work_view( idx ) = idx;
-  });
+  RAJA::forall<EXEC_POLICY>(lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            { work_view(idx) = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallListSegmentViewTest);
 template <typename T>
 class ForallListSegmentViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallListSegmentViewTest, ListSegmentForallView)
 {
@@ -175,15 +168,19 @@ TYPED_TEST_P(ForallListSegmentViewTest, ListSegmentForallView)
   using EXEC_POLICY      = typename camp::at<TypeParam, camp::num<2>>::type;
 
   ForallListSegmentViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(13);
-  ForallListSegmentViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(2047);
-  ForallListSegmentViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(32000);
-
-  ForallListSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(13, 1);
-  ForallListSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(2047, 2);
-  ForallListSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(32000, 3);
+  ForallListSegmentViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      2047);
+  ForallListSegmentViewTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      32000);
+
+  ForallListSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RESOURCE,
+                                      EXEC_POLICY>(13, 1);
+  ForallListSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RESOURCE,
+                                      EXEC_POLICY>(2047, 2);
+  ForallListSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RESOURCE,
+                                      EXEC_POLICY>(32000, 3);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallListSegmentViewTest,
-                            ListSegmentForallView);
+REGISTER_TYPED_TEST_SUITE_P(ForallListSegmentViewTest, ListSegmentForallView);
 
 #endif  // __TEST_FORALL_LISTSEGMENTVIEW_HPP__
diff --git a/test/functional/forall/segment-view/tests/test-forall-RangeSegment2DView.hpp b/test/functional/forall/segment-view/tests/test-forall-RangeSegment2DView.hpp
index b9355d9bc1..96956fd981 100644
--- a/test/functional/forall/segment-view/tests/test-forall-RangeSegment2DView.hpp
+++ b/test/functional/forall/segment-view/tests/test-forall-RangeSegment2DView.hpp
@@ -15,43 +15,41 @@ template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void ForallRangeSegment2DViewTestImpl(INDEX_TYPE N)
 {
   INDEX_TYPE lentot = N * N;
-  const int NDIMS = 2;
+  const int NDIMS   = 2;
 
   RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, lentot);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(lentot,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(lentot, working_res, &working_array,
+                                     &check_array, &test_array);
 
   std::iota(test_array, test_array + lentot, 0);
 
-  using view_type = RAJA::View< INDEX_TYPE, RAJA::Layout<NDIMS> >;
+  using view_type = RAJA::View<INDEX_TYPE, RAJA::Layout<NDIMS>>;
   RAJA::Layout<NDIMS> layout(N, N);
-  
+
   view_type work_view(working_array, layout);
 
-  RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    const INDEX_TYPE row = idx / N;
-    const INDEX_TYPE col = idx % N;
-    work_view(row, col) = row * N + col;
-  });
+  RAJA::forall<EXEC_POLICY>(r1,
+                            [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            {
+                              const INDEX_TYPE row = idx / N;
+                              const INDEX_TYPE col = idx % N;
+                              work_view(row, col)  = row * N + col;
+                            });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * lentot);
 
-  for (INDEX_TYPE i = 0; i < lentot; i++) {
+  for (INDEX_TYPE i = 0; i < lentot; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -59,75 +57,81 @@ template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void ForallRangeSegment2DOffsetViewTestImpl(INDEX_TYPE N)
 {
   const INDEX_TYPE leninterior = N * N;
-  const INDEX_TYPE lentot = (N + 2) * (N + 2);
-  const int NDIMS = 2;
+  const INDEX_TYPE lentot      = (N + 2) * (N + 2);
+  const int NDIMS              = 2;
 
   RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, leninterior);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(lentot,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(lentot, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * lentot ); 
+  memset(test_array, 0, sizeof(INDEX_TYPE) * lentot);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * lentot);
 
-  for (int row = 1; row < N + 1; ++row) {
-    for (int col = 1; col < N + 1; ++col) {
-      int idx = row * (N+2) + col;
-      test_array[ idx ] = (row - 1) * N + (col - 1);
+  for (int row = 1; row < N + 1; ++row)
+  {
+    for (int col = 1; col < N + 1; ++col)
+    {
+      int idx         = row * (N + 2) + col;
+      test_array[idx] = (row - 1) * N + (col - 1);
     }
   }
 
-  using view_type = RAJA::View< INDEX_TYPE, RAJA::OffsetLayout<NDIMS> >;
+  using view_type = RAJA::View<INDEX_TYPE, RAJA::OffsetLayout<NDIMS>>;
   RAJA::OffsetLayout<NDIMS> layout =
-    RAJA::make_offset_layout<NDIMS>( {{-1, -1}} , {{N+1, N+1}} );
+      RAJA::make_offset_layout<NDIMS>({{-1, -1}}, {{N + 1, N + 1}});
 
   view_type work_view(working_array, layout);
 
-  RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    const INDEX_TYPE row = idx / N;
-    const INDEX_TYPE col = idx % N;
-    work_view(row, col) = idx;  
-  });
+  RAJA::forall<EXEC_POLICY>(r1,
+                            [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            {
+                              const INDEX_TYPE row = idx / N;
+                              const INDEX_TYPE col = idx % N;
+                              work_view(row, col)  = idx;
+                            });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * lentot);
 
-  for (INDEX_TYPE i = 0; i < lentot; i++) {
+  for (INDEX_TYPE i = 0; i < lentot; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
 TYPED_TEST_SUITE_P(ForallRangeSegment2DViewTest);
 template <typename T>
 class ForallRangeSegment2DViewTest : public ::testing::Test
-{
-};
+{};
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<INDEX_TYPE>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<INDEX_TYPE>::value>::type* =
+              nullptr>
 void runOffsetViewTests()
-{
-}
+{}
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<INDEX_TYPE>::value>::type* = nullptr>
+template <
+    typename INDEX_TYPE,
+    typename WORKING_RES,
+    typename EXEC_POLICY,
+    typename std::enable_if<std::is_signed<INDEX_TYPE>::value>::type* = nullptr>
 void runOffsetViewTests()
 {
-  ForallRangeSegment2DOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(4);
-  ForallRangeSegment2DOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(100);
+  ForallRangeSegment2DOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      4);
+  ForallRangeSegment2DOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      100);
 }
 
 
diff --git a/test/functional/forall/segment-view/tests/test-forall-RangeSegmentView.hpp b/test/functional/forall/segment-view/tests/test-forall-RangeSegmentView.hpp
index b4449db822..0c981f3da9 100644
--- a/test/functional/forall/segment-view/tests/test-forall-RangeSegmentView.hpp
+++ b/test/functional/forall/segment-view/tests/test-forall-RangeSegmentView.hpp
@@ -16,112 +16,109 @@ void ForallRangeSegmentViewTestImpl(INDEX_TYPE first, INDEX_TYPE last)
   RAJA::TypedRangeSegment<INDEX_TYPE> r1(first, last);
   INDEX_TYPE N = r1.end() - r1.begin();
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
   const INDEX_TYPE rbegin = *r1.begin();
 
   std::iota(test_array, test_array + N, rbegin);
 
-  using view_type = RAJA::View< INDEX_TYPE, RAJA::Layout<1, INDEX_TYPE, 0> >;
- 
+  using view_type = RAJA::View<INDEX_TYPE, RAJA::Layout<1, INDEX_TYPE, 0>>;
+
   RAJA::Layout<1> layout(N);
   view_type work_view(working_array, layout);
 
-  RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    work_view( idx - rbegin ) = idx;
-  }); 
+  RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            { work_view(idx - rbegin) = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
 template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
-void ForallRangeSegmentOffsetViewTestImpl(INDEX_TYPE first, INDEX_TYPE last, 
+void ForallRangeSegmentOffsetViewTestImpl(INDEX_TYPE first,
+                                          INDEX_TYPE last,
                                           INDEX_TYPE offset)
 {
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(first+offset, last+offset);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(first + offset, last + offset);
   INDEX_TYPE N = r1.end() - r1.begin();
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
   const INDEX_TYPE rbegin = *r1.begin();
 
   std::iota(test_array, test_array + N, rbegin);
 
-  using view_type = RAJA::View< INDEX_TYPE, RAJA::OffsetLayout<1, INDEX_TYPE> >;
+  using view_type = RAJA::View<INDEX_TYPE, RAJA::OffsetLayout<1, INDEX_TYPE>>;
 
   INDEX_TYPE f_offset = first + offset;
   INDEX_TYPE l_offset = last + offset;
-  view_type work_view(working_array, 
-                      RAJA::make_offset_layout<1, INDEX_TYPE>({{f_offset}},
-                                                              {{l_offset}}));
+  view_type work_view(working_array, RAJA::make_offset_layout<1, INDEX_TYPE>(
+                                         {{f_offset}}, {{l_offset}}));
 
-  RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    work_view( idx ) = idx;
-  });
+  RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            { work_view(idx) = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<INDEX_TYPE>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<INDEX_TYPE>::value>::type* =
+              nullptr>
 void runNegativeViewTests()
-{
-}
+{}
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<INDEX_TYPE>::value>::type* = nullptr>
+template <
+    typename INDEX_TYPE,
+    typename WORKING_RES,
+    typename EXEC_POLICY,
+    typename std::enable_if<std::is_signed<INDEX_TYPE>::value>::type* = nullptr>
 void runNegativeViewTests()
 {
   ForallRangeSegmentViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(-5, 0);
   ForallRangeSegmentViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(-5, 5);
 
-  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(-5, 0, 1);
-  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(-5, 5, 2);
-  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(0, 10, -5);
+  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      -5, 0, 1);
+  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      -5, 5, 2);
+  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      0, 10, -5);
 }
 
 
 TYPED_TEST_SUITE_P(ForallRangeSegmentViewTest);
 template <typename T>
 class ForallRangeSegmentViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallRangeSegmentViewTest, RangeSegmentForallView)
 {
@@ -133,14 +130,16 @@ TYPED_TEST_P(ForallRangeSegmentViewTest, RangeSegmentForallView)
   ForallRangeSegmentViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(1, 5);
   ForallRangeSegmentViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(1, 255);
 
-  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(0, 5, 1);
-  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(1, 5, 2);
-  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(1, 255, 3);
+  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      0, 5, 1);
+  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      1, 5, 2);
+  ForallRangeSegmentOffsetViewTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      1, 255, 3);
 
   runNegativeViewTests<INDEX_TYPE, WORKING_RES, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallRangeSegmentViewTest,
-                            RangeSegmentForallView);
+REGISTER_TYPED_TEST_SUITE_P(ForallRangeSegmentViewTest, RangeSegmentForallView);
 
 #endif  // __TEST_FORALL_RANGESEGMENTVIEW_HPP__
diff --git a/test/functional/forall/segment-view/tests/test-forall-RangeStrideSegmentView.hpp b/test/functional/forall/segment-view/tests/test-forall-RangeStrideSegmentView.hpp
index c385b929bc..2e56fab16c 100644
--- a/test/functional/forall/segment-view/tests/test-forall-RangeStrideSegmentView.hpp
+++ b/test/functional/forall/segment-view/tests/test-forall-RangeStrideSegmentView.hpp
@@ -8,80 +8,90 @@
 #ifndef __TEST_FORALL_RANGESTRIDESEGMENTVIEW_HPP__
 #define __TEST_FORALL_RANGESTRIDESEGMENTVIEW_HPP__
 
-template <typename INDEX_TYPE, typename DIFF_TYPE, 
-          typename WORKING_RES, typename EXEC_POLICY>
-void ForallRangeStrideSegmentViewTestImpl(INDEX_TYPE first, INDEX_TYPE last, 
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY>
+void ForallRangeStrideSegmentViewTestImpl(INDEX_TYPE first,
+                                          INDEX_TYPE last,
                                           DIFF_TYPE stride)
 {
   RAJA::TypedRangeStrideSegment<INDEX_TYPE> r1(first, last, stride);
   INDEX_TYPE N = r1.size();
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  allocateForallTestData<INDEX_TYPE>(N,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(N, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  memset( test_array, 0, sizeof(INDEX_TYPE) * N );
+  memset(test_array, 0, sizeof(INDEX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * N);
 
   INDEX_TYPE idx = first;
-  for (INDEX_TYPE i = 0; i < N; ++i) {
-    test_array[ (idx-first)/stride ] = idx;
+  for (INDEX_TYPE i = 0; i < N; ++i)
+  {
+    test_array[(idx - first) / stride] = idx;
     idx += stride;
   }
 
-  using view_type = RAJA::View< INDEX_TYPE, RAJA::Layout<1, INDEX_TYPE, 0> >;
+  using view_type = RAJA::View<INDEX_TYPE, RAJA::Layout<1, INDEX_TYPE, 0>>;
 
   RAJA::Layout<1> layout(N);
   view_type work_view(working_array, layout);
 
-  RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-    work_view( (idx-first)/stride ) = idx;
-  });
+  RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                            { work_view((idx - first) / stride) = idx; });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(test_array[i], check_array[i]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
-template <typename INDEX_TYPE, typename DIFF_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<INDEX_TYPE>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<INDEX_TYPE>::value>::type* =
+              nullptr>
 void runNegativeIndexViewTests()
-{
-}
-
-template <typename INDEX_TYPE, typename DIFF_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<INDEX_TYPE>::value>::type* = nullptr>
+{}
+
+template <
+    typename INDEX_TYPE,
+    typename DIFF_TYPE,
+    typename WORKING_RES,
+    typename EXEC_POLICY,
+    typename std::enable_if<std::is_signed<INDEX_TYPE>::value>::type* = nullptr>
 void runNegativeIndexViewTests()
 {
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(-10, -1, 2);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(-5, 0, 2);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(-5, 5, 3);
-
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(10, -1, -1);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(10, 0, -2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(-10, -1, 2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(-5, 0, 2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(-5, 5, 3);
+
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(10, -1, -1);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(10, 0, -2);
 }
 
 
 TYPED_TEST_SUITE_P(ForallRangeStrideSegmentViewTest);
 template <typename T>
 class ForallRangeStrideSegmentViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallRangeStrideSegmentViewTest, RangeStrideSegmentForallView)
 {
@@ -90,17 +100,26 @@ TYPED_TEST_P(ForallRangeStrideSegmentViewTest, RangeStrideSegmentForallView)
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
   using DIFF_TYPE   = typename std::make_signed<INDEX_TYPE>::type;
 
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(0, 20, 1);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(1, 20, 1);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(0, 20, 2);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(1, 20, 2);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(0, 21, 2);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(1, 21, 2);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(1, 255, 2);
-
-// Test size zero segments
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(0, 20, -2);
-  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(1, 20, -2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(0, 20, 1);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(1, 20, 1);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(0, 20, 2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(1, 20, 2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(0, 21, 2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(1, 21, 2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(1, 255, 2);
+
+  // Test size zero segments
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(0, 20, -2);
+  ForallRangeStrideSegmentViewTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                       EXEC_POLICY>(1, 20, -2);
 
   runNegativeIndexViewTests<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>();
 }
diff --git a/test/functional/forall/segment/tests/test-forall-ListSegment.hpp b/test/functional/forall/segment/tests/test-forall-ListSegment.hpp
index 0252af8644..df2f4300c8 100644
--- a/test/functional/forall/segment/tests/test-forall-ListSegment.hpp
+++ b/test/functional/forall/segment/tests/test-forall-ListSegment.hpp
@@ -22,76 +22,83 @@ void ForallListSegmentTestImpl(INDEX_TYPE N)
   // Create and initialize indices in idx_array used to create list segment
   std::vector<INDEX_TYPE> idx_array;
 
-  srand ( time(NULL) );
+  srand(time(NULL));
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i) {
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i)
+  {
     INDEX_TYPE randval = INDEX_TYPE(rand() % RAJA::stripIndexType(N));
-    if ( i < randval ) {
+    if (i < randval)
+    {
       idx_array.push_back(i);
-    }     
+    }
   }
 
   size_t idxlen = idx_array.size();
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   // Create list segment for tests
   INDEX_TYPE* idx_vals = nullptr;
-  if (N > 0) {
+  if (N > 0)
+  {
     idx_vals = &idx_array[0];
   }
-  RAJA::TypedListSegment<INDEX_TYPE> lseg(idx_vals, idxlen, 
-                                          working_res);
+  RAJA::TypedListSegment<INDEX_TYPE> lseg(idx_vals, idxlen, working_res);
 
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
-    for (size_t i = 0; i < idxlen; ++i) {
-      test_array[ RAJA::stripIndexType(idx_vals[i]) ] = idx_vals[i];
+    for (size_t i = 0; i < idxlen; ++i)
+    {
+      test_array[RAJA::stripIndexType(idx_vals[i])] = idx_vals[i];
     }
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::forall<EXEC_POLICY>(lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      working_array[RAJA::stripIndexType(idx)] = idx;
-    }); 
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
 
-  } else { // zero-length segment
+    RAJA::forall<EXEC_POLICY>(lseg,
+                              [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
+                                working_array[RAJA::stripIndexType(idx)] = idx;
+                              });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::forall<EXEC_POLICY>(lseg, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      (void) idx;
-      working_array[0]++;
-    });
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
 
+    RAJA::forall<EXEC_POLICY>(lseg,
+                              [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                              {
+                                (void)idx;
+                                working_array[0]++;
+                              });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -99,8 +106,7 @@ void ForallListSegmentTestImpl(INDEX_TYPE N)
 TYPED_TEST_SUITE_P(ForallListSegmentTest);
 template <typename T>
 class ForallListSegmentTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ForallListSegmentTest, ListSegmentForall)
 {
@@ -109,16 +115,19 @@ TYPED_TEST_P(ForallListSegmentTest, ListSegmentForall)
   using EXEC_POLICY      = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // test zero-length list segment
-  ForallListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(INDEX_TYPE(0));
+  ForallListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      INDEX_TYPE(0));
 
-  ForallListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(INDEX_TYPE(13));
+  ForallListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      INDEX_TYPE(13));
 
-  ForallListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(INDEX_TYPE(2047));
+  ForallListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      INDEX_TYPE(2047));
 
-  ForallListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(INDEX_TYPE(32000));
+  ForallListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, EXEC_POLICY>(
+      INDEX_TYPE(32000));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallListSegmentTest,
-                            ListSegmentForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallListSegmentTest, ListSegmentForall);
 
 #endif  // __TEST_FORALL_LISTSEGMENT_HPP__
diff --git a/test/functional/forall/segment/tests/test-forall-RangeSegment.hpp b/test/functional/forall/segment/tests/test-forall-RangeSegment.hpp
index 8b10d5dc10..a55a655788 100644
--- a/test/functional/forall/segment/tests/test-forall-RangeSegment.hpp
+++ b/test/functional/forall/segment/tests/test-forall-RangeSegment.hpp
@@ -14,57 +14,60 @@
 template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void ForallRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 {
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first), RAJA::stripIndexType(last));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first),
+                                         RAJA::stripIndexType(last));
   INDEX_TYPE N = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     const INDEX_TYPE rbegin = *r1.begin();
 
     std::iota(test_array, test_array + RAJA::stripIndexType(N), rbegin);
 
-    RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      working_array[RAJA::stripIndexType(idx - rbegin)] = idx;
-    });
-
-  } else { // zero-length segment 
+    RAJA::forall<EXEC_POLICY>(
+        r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+        { working_array[RAJA::stripIndexType(idx - rbegin)] = idx; });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      (void) idx;
-      working_array[0]++;
-    });
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
 
+    RAJA::forall<EXEC_POLICY>(r1,
+                              [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                              {
+                                (void)idx;
+                                working_array[0]++;
+                              });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -72,24 +75,31 @@ void ForallRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 TYPED_TEST_SUITE_P(ForallRangeSegmentTest);
 template <typename T>
 class ForallRangeSegmentTest : public ::testing::Test
-{
-};
+{};
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
-{
-}
+{}
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
 {
   // test zero-length range segment
-  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(-5));
+  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(-5));
 
-  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0));
-  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5));
+  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(0));
+  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(5));
 }
 
 
@@ -100,16 +110,19 @@ TYPED_TEST_P(ForallRangeSegmentTest, RangeSegmentForall)
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // test zero-length range segment
-  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(3), INDEX_TYPE(3));
+  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(3), INDEX_TYPE(3));
 
-  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(27));
-  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(2047));
-  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(32000));
+  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(27));
+  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(2047));
+  ForallRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(32000));
 
   runNegativeTests<INDEX_TYPE, WORKING_RES, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ForallRangeSegmentTest,
-                            RangeSegmentForall);
+REGISTER_TYPED_TEST_SUITE_P(ForallRangeSegmentTest, RangeSegmentForall);
 
 #endif  // __TEST_FORALL_RANGESEGMENT_HPP__
diff --git a/test/functional/forall/segment/tests/test-forall-RangeStrideSegment.hpp b/test/functional/forall/segment/tests/test-forall-RangeStrideSegment.hpp
index 00046e15bf..e92ec54af2 100644
--- a/test/functional/forall/segment/tests/test-forall-RangeStrideSegment.hpp
+++ b/test/functional/forall/segment/tests/test-forall-RangeStrideSegment.hpp
@@ -10,65 +10,71 @@
 
 #include <cstring>
 
-template <typename INDEX_TYPE, typename DIFF_TYPE, 
-          typename WORKING_RES, typename EXEC_POLICY>
-void ForallRangeStrideSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last, 
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY>
+void ForallRangeStrideSegmentTestImpl(INDEX_TYPE first,
+                                      INDEX_TYPE last,
                                       DIFF_TYPE stride)
 {
-  RAJA::TypedRangeStrideSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first), RAJA::stripIndexType(last), stride);
+  RAJA::TypedRangeStrideSegment<INDEX_TYPE> r1(
+      RAJA::stripIndexType(first), RAJA::stripIndexType(last), stride);
   INDEX_TYPE N = INDEX_TYPE(r1.size());
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
+  camp::resources::Resource host_res {camp::resources::Host()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
   memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-  working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len); 
+  working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     INDEX_TYPE idx = first;
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i) {
-      test_array[ RAJA::stripIndexType((idx-first)/stride) ] = idx;
-      idx += stride; 
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i)
+    {
+      test_array[RAJA::stripIndexType((idx - first) / stride)] = idx;
+      idx += stride;
     }
 
-    RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      working_array[ RAJA::stripIndexType((idx-first)/stride) ] = idx;
-    });
-
-  } else { // zero-length segment
-
-    RAJA::forall<EXEC_POLICY>(r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) {
-      (void) idx;
-      working_array[0]++;
-    });
-
+    RAJA::forall<EXEC_POLICY>(
+        r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+        { working_array[RAJA::stripIndexType((idx - first) / stride)] = idx; });
+  }
+  else
+  {  // zero-length segment
+
+    RAJA::forall<EXEC_POLICY>(r1,
+                              [=] RAJA_HOST_DEVICE(INDEX_TYPE idx)
+                              {
+                                (void)idx;
+                                working_array[0]++;
+                              });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -76,26 +82,42 @@ void ForallRangeStrideSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last,
 TYPED_TEST_SUITE_P(ForallRangeStrideSegmentTest);
 template <typename T>
 class ForallRangeStrideSegmentTest : public ::testing::Test
-{
-};
-
-template <typename INDEX_TYPE, typename DIFF_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+{};
+
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeStrideTests()
-{
-}
-
-template <typename INDEX_TYPE, typename DIFF_TYPE, typename WORKING_RES, typename EXEC_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+{}
+
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeStrideTests()
 {
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-10), INDEX_TYPE(-1), DIFF_TYPE(2));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0), DIFF_TYPE(2));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5), DIFF_TYPE(3));
-
-// Test negative strides
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(10), INDEX_TYPE(-1), DIFF_TYPE(-1));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(10), INDEX_TYPE(0), DIFF_TYPE(-2));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(-10), INDEX_TYPE(-1),
+                                                DIFF_TYPE(2));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0),
+                                                DIFF_TYPE(2));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5),
+                                                DIFF_TYPE(3));
+
+  // Test negative strides
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(10), INDEX_TYPE(-1),
+                                                DIFF_TYPE(-1));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(10), INDEX_TYPE(0),
+                                                DIFF_TYPE(-2));
 }
 
 
@@ -104,19 +126,38 @@ TYPED_TEST_P(ForallRangeStrideSegmentTest, RangeStrideSegmentForall)
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
-  using DIFF_TYPE   = typename std::make_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::type;
-
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(1));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(1));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(2));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(2));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(21), DIFF_TYPE(2));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(21), DIFF_TYPE(2));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(255), DIFF_TYPE(2));
-
-// Test size zero segments
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(-2));
-  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(-2));
+  using DIFF_TYPE =
+      typename std::make_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::type;
+
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20),
+                                                DIFF_TYPE(1));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20),
+                                                DIFF_TYPE(1));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20),
+                                                DIFF_TYPE(2));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20),
+                                                DIFF_TYPE(2));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(21),
+                                                DIFF_TYPE(2));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(21),
+                                                DIFF_TYPE(2));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(255),
+                                                DIFF_TYPE(2));
+
+  // Test size zero segments
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20),
+                                                DIFF_TYPE(-2));
+  ForallRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   EXEC_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20),
+                                                DIFF_TYPE(-2));
 
   runNegativeStrideTests<INDEX_TYPE, DIFF_TYPE, WORKING_RES, EXEC_POLICY>();
 }
diff --git a/test/functional/indexset-build/test-aligned-indexset.cpp b/test/functional/indexset-build/test-aligned-indexset.cpp
index ec40004b2d..393d1ad7ba 100644
--- a/test/functional/indexset-build/test-aligned-indexset.cpp
+++ b/test/functional/indexset-build/test-aligned-indexset.cpp
@@ -11,7 +11,7 @@
 
 #include "RAJA_test-base.hpp"
 
-#include "RAJA/index/IndexSetBuilders.hpp" 
+#include "RAJA/index/IndexSetBuilders.hpp"
 
 #include "camp/resource.hpp"
 
@@ -21,7 +21,7 @@
 TEST(IndexSetBuild, Aligned)
 {
   const RAJA::Index_type range_min_length = 8;
-  const RAJA::Index_type range_align = 2;
+  const RAJA::Index_type range_align      = 2;
 
   using RSType = RAJA::RangeSegment;
   using LSType = RAJA::ListSegment;
@@ -36,7 +36,8 @@ TEST(IndexSetBuild, Aligned)
   indices.push_back(17);
   indices.push_back(18);
 
-  for (RAJA::Index_type i = 20; i < 28; ++i) {
+  for (RAJA::Index_type i = 20; i < 28; ++i)
+  {
     indices.push_back(i);
   }
 
@@ -44,16 +45,13 @@ TEST(IndexSetBuild, Aligned)
   indices.push_back(30);
   indices.push_back(31);
 
-  camp::resources::Resource res{camp::resources::Host()};
- 
+  camp::resources::Resource res {camp::resources::Host()};
+
   RAJA::TypedIndexSet<RAJA::RangeSegment, RAJA::ListSegment> iset;
 
-  RAJA::buildIndexSetAligned(iset, 
-                             res,
-                             &indices[0],
+  RAJA::buildIndexSetAligned(iset, res, &indices[0],
                              static_cast<RAJA::Index_type>(indices.size()),
-                             range_min_length,
-                             range_align);
+                             range_min_length, range_align);
 
   ASSERT_EQ(iset.getLength(), indices.size());
 
diff --git a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp
index 44a2a9ffa1..440239a700 100644
--- a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp
+++ b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp
@@ -28,7 +28,8 @@ void KernelBasicFissionFusionLoopTestImpl(
 {
   IDX_TYPE data_len = IDX_TYPE(0);
 
-  if (seg_idx.size() > 0) {
+  if (seg_idx.size() > 0)
+  {
     data_len = seg_idx[seg_idx.size() - 1] + 1;
   }
 
@@ -41,66 +42,60 @@ void KernelBasicFissionFusionLoopTestImpl(
   DATA_TYPE* test_array_y;
 
   allocateForallTestData<DATA_TYPE>(RAJA::stripIndexType(data_len),
-                                    erased_working_res,
-                                    &working_array_x,
-                                    &check_array_x,
-                                    &test_array_x);
+                                    erased_working_res, &working_array_x,
+                                    &check_array_x, &test_array_x);
 
   allocateForallTestData<DATA_TYPE>(RAJA::stripIndexType(data_len),
-                                    erased_working_res,
-                                    &working_array_y,
-                                    &check_array_y,
-                                    &test_array_y);
+                                    erased_working_res, &working_array_y,
+                                    &check_array_y, &test_array_y);
 
 
-  working_res.memset(working_array_x,
-                     0,
+  working_res.memset(working_array_x, 0,
                      sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
   RAJA::kernel<EXEC_POLICY>(
       RAJA::make_tuple(seg, seg),
 
-      [=] RAJA_HOST_DEVICE(IDX_TYPE i) {
+      [=] RAJA_HOST_DEVICE(IDX_TYPE i)
+      {
         RAJA::atomicAdd<RAJA::auto_atomic>(
             &working_array_x[RAJA::stripIndexType(i)], (DATA_TYPE)1);
       },
 
-      [=] RAJA_HOST_DEVICE(IDX_TYPE i) {
+      [=] RAJA_HOST_DEVICE(IDX_TYPE i)
+      {
         RAJA::atomicAdd<RAJA::auto_atomic>(
             &working_array_x[RAJA::stripIndexType(i)], (DATA_TYPE)2);
       }
 
   );
 
-  working_res.memcpy(check_array_x,
-                     working_array_x,
+  working_res.memcpy(check_array_x, working_array_x,
                      sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
-  memset(static_cast<void*>(check_array_y),
-         0,
+  memset(static_cast<void*>(check_array_y), 0,
          sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
-  RAJA::forall<RAJA::seq_exec>(working_res, seg_idx, [=](IDX_TYPE i) {
-    check_array_y[RAJA::stripIndexType(i)] += 1;
-    check_array_y[RAJA::stripIndexType(i)] += 2;
-  });
+  RAJA::forall<RAJA::seq_exec>(working_res, seg_idx,
+                               [=](IDX_TYPE i)
+                               {
+                                 check_array_y[RAJA::stripIndexType(i)] += 1;
+                                 check_array_y[RAJA::stripIndexType(i)] += 2;
+                               });
 
 
-  for (IDX_TYPE i = IDX_TYPE(0); i < data_len; ++i) {
+  for (IDX_TYPE i = IDX_TYPE(0); i < data_len; ++i)
+  {
     ASSERT_EQ(check_array_x[RAJA::stripIndexType(i)],
               check_array_y[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<DATA_TYPE>(erased_working_res,
-                                      working_array_x,
-                                      check_array_x,
-                                      test_array_x);
+  deallocateForallTestData<DATA_TYPE>(erased_working_res, working_array_x,
+                                      check_array_x, test_array_x);
 
 
-  deallocateForallTestData<DATA_TYPE>(erased_working_res,
-                                      working_array_y,
-                                      check_array_y,
-                                      test_array_y);
+  deallocateForallTestData<DATA_TYPE>(erased_working_res, working_array_y,
+                                      check_array_y, test_array_y);
 }
 
 #endif  // __BASIC_FISSION_FUSION_LOOP_SEGMENTS_IMPL_HPP__
diff --git a/test/functional/kernel/basic-fission-fusion-loop/tests/test-kernel-basic-fission-fusion-loop-segments.hpp b/test/functional/kernel/basic-fission-fusion-loop/tests/test-kernel-basic-fission-fusion-loop-segments.hpp
index 0627e469af..141bbc7687 100644
--- a/test/functional/kernel/basic-fission-fusion-loop/tests/test-kernel-basic-fission-fusion-loop-segments.hpp
+++ b/test/functional/kernel/basic-fission-fusion-loop/tests/test-kernel-basic-fission-fusion-loop-segments.hpp
@@ -13,18 +13,17 @@
 TYPED_TEST_SUITE_P(KernelBasicFissionFusionLoopTest);
 template <typename T>
 class KernelBasicFissionFusionLoopTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelBasicFissionFusionLoopTest,
              BasicFissionFusionLoopSegmentKernel)
 {
-  using IDX_TYPE = typename camp::at<TypeParam, camp::num<0>>::type;
+  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  WORKING_RES working_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_working_res{working_res};
+  WORKING_RES working_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_working_res {working_res};
 
   std::vector<IDX_TYPE> seg_idx;
 
@@ -32,18 +31,14 @@ TYPED_TEST_P(KernelBasicFissionFusionLoopTest,
   RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 37);
   RAJA::getIndices(seg_idx, r1);
 
-  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE,
-                                       EXEC_POLICY,
-                                       WORKING_RES,
+  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                        RAJA::TypedRangeSegment<IDX_TYPE>>(
       r1, seg_idx, working_res, erased_working_res);
 
   seg_idx.clear();
   RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 2057);
   RAJA::getIndices(seg_idx, r2);
-  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE,
-                                       EXEC_POLICY,
-                                       WORKING_RES,
+  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                        RAJA::TypedRangeSegment<IDX_TYPE>>(
       r2, seg_idx, working_res, erased_working_res);
 
@@ -52,9 +47,7 @@ TYPED_TEST_P(KernelBasicFissionFusionLoopTest,
   RAJA::TypedRangeSegment<IDX_TYPE> r3(5, 5);
   RAJA::getIndices(seg_idx, r3);
 
-  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE,
-                                       EXEC_POLICY,
-                                       WORKING_RES,
+  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                        RAJA::TypedRangeSegment<IDX_TYPE>>(
       r3, seg_idx, working_res, erased_working_res);
 
@@ -62,18 +55,14 @@ TYPED_TEST_P(KernelBasicFissionFusionLoopTest,
   seg_idx.clear();
   RAJA::TypedRangeStrideSegment<IDX_TYPE> rs1(0, 188, 2);
   RAJA::getIndices(seg_idx, rs1);
-  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE,
-                                       EXEC_POLICY,
-                                       WORKING_RES,
+  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                        RAJA::TypedRangeStrideSegment<IDX_TYPE>>(
       rs1, seg_idx, working_res, erased_working_res);
 
   seg_idx.clear();
   RAJA::TypedRangeStrideSegment<IDX_TYPE> rs2(2, 1029, 3);
   RAJA::getIndices(seg_idx, rs2);
-  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE,
-                                       EXEC_POLICY,
-                                       WORKING_RES,
+  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                        RAJA::TypedRangeStrideSegment<IDX_TYPE>>(
       rs2, seg_idx, working_res, erased_working_res);
 
@@ -81,9 +70,7 @@ TYPED_TEST_P(KernelBasicFissionFusionLoopTest,
   seg_idx.clear();
   RAJA::TypedRangeStrideSegment<IDX_TYPE> rs3(2, 2, 3);
   RAJA::getIndices(seg_idx, rs3);
-  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE,
-                                       EXEC_POLICY,
-                                       WORKING_RES,
+  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                        RAJA::TypedRangeStrideSegment<IDX_TYPE>>(
       rs3, seg_idx, working_res, erased_working_res);
 
@@ -91,29 +78,25 @@ TYPED_TEST_P(KernelBasicFissionFusionLoopTest,
   seg_idx.clear();
   IDX_TYPE last = IDX_TYPE(10567);
   srand(time(NULL));
-  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i) {
+  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i)
+  {
     IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
-    if (i < randval) {
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0],
-                                      seg_idx.size(),
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(),
                                       erased_working_res);
-  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE,
-                                       EXEC_POLICY,
-                                       WORKING_RES,
+  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                        RAJA::TypedListSegment<IDX_TYPE>>(
       l1, seg_idx, working_res, erased_working_res);
 
   // test zero-length list segment
   seg_idx.clear();
-  RAJA::TypedListSegment<IDX_TYPE> l2(nullptr,
-                                      seg_idx.size(),
+  RAJA::TypedListSegment<IDX_TYPE> l2(nullptr, seg_idx.size(),
                                       erased_working_res);
-  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE,
-                                       EXEC_POLICY,
-                                       WORKING_RES,
+  KernelBasicFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                        RAJA::TypedListSegment<IDX_TYPE>>(
       l2, seg_idx, working_res, erased_working_res);
 }
diff --git a/test/functional/kernel/basic-single-icount-loop/tests/basic-single-icount-loop-impl.hpp b/test/functional/kernel/basic-single-icount-loop/tests/basic-single-icount-loop-impl.hpp
index e22f544062..03b5813640 100644
--- a/test/functional/kernel/basic-single-icount-loop/tests/basic-single-icount-loop-impl.hpp
+++ b/test/functional/kernel/basic-single-icount-loop/tests/basic-single-icount-loop-impl.hpp
@@ -16,15 +16,20 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename EXEC_POLICY, typename WORKING_RES, typename SEG_TYPE>
-void KernelBasicSingleICountLoopTestImpl(const SEG_TYPE& seg, 
-                                   const std::vector<IDX_TYPE>& seg_idx,
-                                   WORKING_RES working_res,
-                                   camp::resources::Resource erased_working_res)
+template <typename IDX_TYPE,
+          typename EXEC_POLICY,
+          typename WORKING_RES,
+          typename SEG_TYPE>
+void KernelBasicSingleICountLoopTestImpl(
+    const SEG_TYPE& seg,
+    const std::vector<IDX_TYPE>& seg_idx,
+    WORKING_RES working_res,
+    camp::resources::Resource erased_working_res)
 {
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
   IDX_TYPE data_len = IDX_TYPE(0);
-  if ( seg_idx.size() > 0 ) {
+  if (seg_idx.size() > 0)
+  {
     data_len = seg_idx[seg_idx.size() - 1] + 1;
   }
 
@@ -35,86 +40,79 @@ void KernelBasicSingleICountLoopTestImpl(const SEG_TYPE& seg,
   IDX_TYPE* test_array;
   IDX_TYPE* test_array_i;
 
-  if ( RAJA::stripIndexType(data_len) == 0 ) {
+  if (RAJA::stripIndexType(data_len) == 0)
+  {
     data_len++;
   }
 
-  allocateForallTestData<IDX_TYPE>(data_len,
-                                   erased_working_res,
-                                   &working_array,
-                                   &check_array,
-                                   &test_array);
+  allocateForallTestData<IDX_TYPE>(data_len, erased_working_res, &working_array,
+                                   &check_array, &test_array);
 
-  allocateForallTestData<IDX_TYPE>(data_len,
-                                   erased_working_res,
-                                   &working_array_i,
-                                   &check_array_i,
+  allocateForallTestData<IDX_TYPE>(data_len, erased_working_res,
+                                   &working_array_i, &check_array_i,
                                    &test_array_i);
 
-  memset(static_cast<void*>(test_array), 0, 
+  memset(static_cast<void*>(test_array), 0,
          sizeof(IDX_TYPE) * RAJA::stripIndexType(data_len));
 
-  working_res.memcpy(working_array, test_array, 
+  working_res.memcpy(working_array, test_array,
                      sizeof(IDX_TYPE) * RAJA::stripIndexType(data_len));
 
-  working_res.memcpy(working_array_i, test_array_i, 
+  working_res.memcpy(working_array_i, test_array_i,
                      sizeof(IDX_TYPE) * RAJA::stripIndexType(data_len));
 
-  if ( RAJA::stripIndexType(idx_len) > 0 ) {
+  if (RAJA::stripIndexType(idx_len) > 0)
+  {
 
-    for (IDX_TYPE i = IDX_TYPE(0); i < idx_len; ++i) {
-      test_array  [ RAJA::stripIndexType(seg_idx[RAJA::stripIndexType(i)]) ] = 
-        seg_idx[RAJA::stripIndexType(i)];
-      test_array_i[ RAJA::stripIndexType(RAJA::stripIndexType(i)) ] = 
-        IDX_TYPE(i);
+    for (IDX_TYPE i = IDX_TYPE(0); i < idx_len; ++i)
+    {
+      test_array[RAJA::stripIndexType(seg_idx[RAJA::stripIndexType(i)])] =
+          seg_idx[RAJA::stripIndexType(i)];
+      test_array_i[RAJA::stripIndexType(RAJA::stripIndexType(i))] = IDX_TYPE(i);
     }
- 
+
     RAJA::kernel_param<EXEC_POLICY>(
-      RAJA::make_tuple(seg),
-      RAJA::make_tuple(IDX_TYPE(0)),
-      
-      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, IDX_TYPE i_idx) {
-        working_array[RAJA::stripIndexType(idx)] = IDX_TYPE(idx) ;
-        working_array_i[RAJA::stripIndexType(i_idx)] = IDX_TYPE(i_idx) ;
-      }
-    );
+        RAJA::make_tuple(seg), RAJA::make_tuple(IDX_TYPE(0)),
 
-  } else { // zero-length segment
+        [=] RAJA_HOST_DEVICE(IDX_TYPE idx, IDX_TYPE i_idx)
+        {
+          working_array[RAJA::stripIndexType(idx)]     = IDX_TYPE(idx);
+          working_array_i[RAJA::stripIndexType(i_idx)] = IDX_TYPE(i_idx);
+        });
+  }
+  else
+  {  // zero-length segment
 
     RAJA::kernel_param<EXEC_POLICY>(
-      RAJA::make_tuple(seg),
-      RAJA::make_tuple(IDX_TYPE(0)),
-      
-      [=] RAJA_HOST_DEVICE(IDX_TYPE idx, IDX_TYPE i_idx) {
-        (void) idx; (void) i_idx;
-        working_array[0]++;
-        working_array_i[0]++;
-      }
-    );
-
+        RAJA::make_tuple(seg), RAJA::make_tuple(IDX_TYPE(0)),
+
+        [=] RAJA_HOST_DEVICE(IDX_TYPE idx, IDX_TYPE i_idx)
+        {
+          (void)idx;
+          (void)i_idx;
+          working_array[0]++;
+          working_array_i[0]++;
+        });
   }
 
-  working_res.memcpy(check_array, working_array, 
+  working_res.memcpy(check_array, working_array,
                      sizeof(IDX_TYPE) * RAJA::stripIndexType(data_len));
-  working_res.memcpy(check_array_i, working_array_i, 
+  working_res.memcpy(check_array_i, working_array_i,
                      sizeof(IDX_TYPE) * RAJA::stripIndexType(data_len));
 
-  for (IDX_TYPE i = IDX_TYPE(0); i < data_len; ++i) {
-    ASSERT_EQ( test_array[RAJA::stripIndexType(i)],
-               check_array[RAJA::stripIndexType(i)] );
-    ASSERT_EQ( test_array_i[RAJA::stripIndexType(i)],
-               check_array_i[RAJA::stripIndexType(i)] );
+  for (IDX_TYPE i = IDX_TYPE(0); i < data_len; ++i)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
+    ASSERT_EQ(test_array_i[RAJA::stripIndexType(i)],
+              check_array_i[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<IDX_TYPE>(erased_working_res,
-                                     working_array,
-                                     check_array,
-                                     test_array);
+  deallocateForallTestData<IDX_TYPE>(erased_working_res, working_array,
+                                     check_array, test_array);
 
-  deallocateForallTestData<IDX_TYPE>(erased_working_res,
-                                     working_array_i,
-                                     check_array_i,
-                                     test_array_i);
+  deallocateForallTestData<IDX_TYPE>(erased_working_res, working_array_i,
+                                     check_array_i, test_array_i);
 }
 
 #endif  // __BASIC_SINGLE_ICOUNT_LOOP_SEGMENTS_IMPL_HPP__
diff --git a/test/functional/kernel/basic-single-icount-loop/tests/test-kernel-basic-single-icount-loop-segments.hpp b/test/functional/kernel/basic-single-icount-loop/tests/test-kernel-basic-single-icount-loop-segments.hpp
index e6bd76fef9..d1f00123d8 100644
--- a/test/functional/kernel/basic-single-icount-loop/tests/test-kernel-basic-single-icount-loop-segments.hpp
+++ b/test/functional/kernel/basic-single-icount-loop/tests/test-kernel-basic-single-icount-loop-segments.hpp
@@ -13,88 +13,92 @@
 TYPED_TEST_SUITE_P(KernelBasicSingleICountLoopTest);
 template <typename T>
 class KernelBasicSingleICountLoopTest : public ::testing::Test
-{
-};
+{};
 
-TYPED_TEST_P(KernelBasicSingleICountLoopTest, BasicSingleICountLoopSegmentKernel)
+TYPED_TEST_P(KernelBasicSingleICountLoopTest,
+             BasicSingleICountLoopSegmentKernel)
 {
   using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  WORKING_RES working_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_working_res{working_res};
+  WORKING_RES working_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_working_res {working_res};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 37 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 37);
   RAJA::getIndices(seg_idx, r1);
 
   KernelBasicSingleICountLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                       RAJA::TypedRangeSegment<IDX_TYPE>>(
-                                      r1, seg_idx, working_res, erased_working_res);
+      r1, seg_idx, working_res, erased_working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 2057);
   RAJA::getIndices(seg_idx, r2);
   KernelBasicSingleICountLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                       RAJA::TypedRangeSegment<IDX_TYPE>>(
-                                      r2, seg_idx, working_res, erased_working_res);
+      r2, seg_idx, working_res, erased_working_res);
 
   // test zero-length range segment
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 5, 5 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(5, 5);
   RAJA::getIndices(seg_idx, r3);
 
   KernelBasicSingleICountLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                       RAJA::TypedRangeSegment<IDX_TYPE>>(
-                                      r3, seg_idx, working_res, erased_working_res);
+      r3, seg_idx, working_res, erased_working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs1( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs1(0, 188, 2);
   RAJA::getIndices(seg_idx, rs1);
   KernelBasicSingleICountLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                       RAJA::TypedRangeStrideSegment<IDX_TYPE>>(
-                                      rs1, seg_idx, working_res, erased_working_res);
+      rs1, seg_idx, working_res, erased_working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs2( 2, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs2(2, 1029, 3);
   RAJA::getIndices(seg_idx, rs2);
   KernelBasicSingleICountLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                       RAJA::TypedRangeStrideSegment<IDX_TYPE>>(
-                                      rs2, seg_idx, working_res, erased_working_res);
+      rs2, seg_idx, working_res, erased_working_res);
 
   // test zero-length range-stride segment
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs3( 2, 2, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs3(2, 2, 3);
   RAJA::getIndices(seg_idx, rs3);
   KernelBasicSingleICountLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                       RAJA::TypedRangeStrideSegment<IDX_TYPE>>(
-                                      rs3, seg_idx, working_res, erased_working_res);
+      rs3, seg_idx, working_res, erased_working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = IDX_TYPE(10567);
-  srand( time(NULL) );
-  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(), erased_working_res);
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(),
+                                      erased_working_res);
   KernelBasicSingleICountLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                       RAJA::TypedListSegment<IDX_TYPE>>(
-                                      l1, seg_idx, working_res, erased_working_res);
+      l1, seg_idx, working_res, erased_working_res);
 
   // test zero-length list segment
   seg_idx.clear();
-  RAJA::TypedListSegment<IDX_TYPE> l2( nullptr, seg_idx.size(), erased_working_res);
+  RAJA::TypedListSegment<IDX_TYPE> l2(nullptr, seg_idx.size(),
+                                      erased_working_res);
   KernelBasicSingleICountLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                       RAJA::TypedListSegment<IDX_TYPE>>(
-                                      l2, seg_idx, working_res, erased_working_res);
+      l2, seg_idx, working_res, erased_working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelBasicSingleICountLoopTest,
diff --git a/test/functional/kernel/basic-single-loop/tests/basic-single-loop-segments-impl.hpp b/test/functional/kernel/basic-single-loop/tests/basic-single-loop-segments-impl.hpp
index 6b4239e84a..1d6e0e5938 100644
--- a/test/functional/kernel/basic-single-loop/tests/basic-single-loop-segments-impl.hpp
+++ b/test/functional/kernel/basic-single-loop/tests/basic-single-loop-segments-impl.hpp
@@ -16,32 +16,42 @@
 #include <numeric>
 #include <vector>
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_resource<EXEC_POL>( segs, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args)
+{
+  RAJA::kernel_resource<EXEC_POL>(segs, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args) {
-  RAJA::kernel<EXEC_POL>( segs, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel<EXEC_POL>(segs, args...);
 }
 
-template <typename IDX_TYPE, typename EXEC_POLICY, typename WORKING_RES,
-          typename SEG_TYPE, bool USE_RESOURCE>
-void KernelBasicSingleLoopTestImpl(const SEG_TYPE& seg, 
+template <typename IDX_TYPE,
+          typename EXEC_POLICY,
+          typename WORKING_RES,
+          typename SEG_TYPE,
+          bool USE_RESOURCE>
+void KernelBasicSingleLoopTestImpl(const SEG_TYPE& seg,
                                    const std::vector<IDX_TYPE>& seg_idx,
                                    WORKING_RES working_res,
                                    camp::resources::Resource erased_working_res)
 {
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
   IDX_TYPE data_len = IDX_TYPE(0);
-  if ( seg_idx.size() > 0 ) {
+  if (seg_idx.size() > 0)
+  {
     data_len = seg_idx[seg_idx.size() - 1] + 1;
   }
 
@@ -49,58 +59,56 @@ void KernelBasicSingleLoopTestImpl(const SEG_TYPE& seg,
   IDX_TYPE* check_array;
   IDX_TYPE* test_array;
 
-  if ( RAJA::stripIndexType(data_len) == 0 ) {
+  if (RAJA::stripIndexType(data_len) == 0)
+  {
     data_len++;
   }
 
-  allocateForallTestData<IDX_TYPE>(data_len,
-                                   erased_working_res,
-                                   &working_array,
-                                   &check_array,
-                                   &test_array);
+  allocateForallTestData<IDX_TYPE>(data_len, erased_working_res, &working_array,
+                                   &check_array, &test_array);
 
-  memset(static_cast<void*>(test_array), 0, 
+  memset(static_cast<void*>(test_array), 0,
          sizeof(IDX_TYPE) * RAJA::stripIndexType(data_len));
 
-  working_res.memcpy(working_array, test_array, 
+  working_res.memcpy(working_array, test_array,
                      sizeof(IDX_TYPE) * RAJA::stripIndexType(data_len));
 
-  if ( RAJA::stripIndexType(idx_len) > 0 ) {
+  if (RAJA::stripIndexType(idx_len) > 0)
+  {
 
-    for (IDX_TYPE i = IDX_TYPE(0); i < idx_len; ++i) {
-      test_array[ RAJA::stripIndexType(seg_idx[RAJA::stripIndexType(i)]) ] = 
-         seg_idx[RAJA::stripIndexType(i)];
+    for (IDX_TYPE i = IDX_TYPE(0); i < idx_len; ++i)
+    {
+      test_array[RAJA::stripIndexType(seg_idx[RAJA::stripIndexType(i)])] =
+          seg_idx[RAJA::stripIndexType(i)];
     }
- 
-    call_kernel<EXEC_POLICY, USE_RESOURCE>( RAJA::make_tuple(seg), working_res,
-      [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-        working_array[RAJA::stripIndexType(idx)] = idx;
-      }
-    );
-
-  } else { // zero-length segment
-
-    call_kernel<EXEC_POLICY, USE_RESOURCE>( RAJA::make_tuple(seg), working_res,
-      [=] RAJA_HOST_DEVICE(IDX_TYPE idx) {
-        (void) idx;
-        working_array[0]++;
-      }
-    );
 
+    call_kernel<EXEC_POLICY, USE_RESOURCE>(
+        RAJA::make_tuple(seg), working_res,
+        [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+        { working_array[RAJA::stripIndexType(idx)] = idx; });
+  }
+  else
+  {  // zero-length segment
+
+    call_kernel<EXEC_POLICY, USE_RESOURCE>(RAJA::make_tuple(seg), working_res,
+                                           [=] RAJA_HOST_DEVICE(IDX_TYPE idx)
+                                           {
+                                             (void)idx;
+                                             working_array[0]++;
+                                           });
   }
 
-  working_res.memcpy(check_array, working_array, 
+  working_res.memcpy(check_array, working_array,
                      sizeof(IDX_TYPE) * RAJA::stripIndexType(data_len));
 
-  for (IDX_TYPE i = IDX_TYPE(0); i < data_len; ++i) {
-    ASSERT_EQ( test_array[RAJA::stripIndexType(i)], 
-               check_array[RAJA::stripIndexType(i)] );
+  for (IDX_TYPE i = IDX_TYPE(0); i < data_len; ++i)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<IDX_TYPE>(erased_working_res,
-                                     working_array,
-                                     check_array,
-                                     test_array);
+  deallocateForallTestData<IDX_TYPE>(erased_working_res, working_array,
+                                     check_array, test_array);
 }
 
 #endif  // __BASIC_SINGLE_LOOP_SEGMENTS_IMPL_HPP__
diff --git a/test/functional/kernel/basic-single-loop/tests/test-kernel-basic-single-loop-segments.hpp b/test/functional/kernel/basic-single-loop/tests/test-kernel-basic-single-loop-segments.hpp
index 5a7ce88f55..156aaf7d1d 100644
--- a/test/functional/kernel/basic-single-loop/tests/test-kernel-basic-single-loop-segments.hpp
+++ b/test/functional/kernel/basic-single-loop/tests/test-kernel-basic-single-loop-segments.hpp
@@ -13,8 +13,7 @@
 TYPED_TEST_SUITE_P(KernelBasicSingleLoopTest);
 template <typename T>
 class KernelBasicSingleLoopTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelBasicSingleLoopTest, BasicSingleLoopSegmentKernel)
 {
@@ -22,81 +21,88 @@ TYPED_TEST_P(KernelBasicSingleLoopTest, BasicSingleLoopSegmentKernel)
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  WORKING_RES working_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_working_res{working_res};
+  WORKING_RES working_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_working_res {working_res};
 
   constexpr bool USE_RES = false;
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 37 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 37);
   RAJA::getIndices(seg_idx, r1);
 
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedRangeSegment<IDX_TYPE>, USE_RES>(
-                                  r1, seg_idx, working_res, erased_working_res);
+      r1, seg_idx, working_res, erased_working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 2057);
   RAJA::getIndices(seg_idx, r2);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedRangeSegment<IDX_TYPE>, USE_RES>(
-                                  r2, seg_idx, working_res, erased_working_res);
+      r2, seg_idx, working_res, erased_working_res);
 
   // test zero-length range segment
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 5, 5 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(5, 5);
   RAJA::getIndices(seg_idx, r3);
 
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedRangeSegment<IDX_TYPE>, USE_RES>(
-                                  r3, seg_idx, working_res, erased_working_res);
+      r3, seg_idx, working_res, erased_working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs1( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs1(0, 188, 2);
   RAJA::getIndices(seg_idx, rs1);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
-                                RAJA::TypedRangeStrideSegment<IDX_TYPE>, USE_RES>(
-                                  rs1, seg_idx, working_res, erased_working_res);
+                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                USE_RES>(rs1, seg_idx, working_res,
+                                         erased_working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs2( 2, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs2(2, 1029, 3);
   RAJA::getIndices(seg_idx, rs2);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
-                                RAJA::TypedRangeStrideSegment<IDX_TYPE>, USE_RES>(
-                                  rs2, seg_idx, working_res, erased_working_res);
+                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                USE_RES>(rs2, seg_idx, working_res,
+                                         erased_working_res);
 
   // test zero-length range-stride segment
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs3( 2, 2, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs3(2, 2, 3);
   RAJA::getIndices(seg_idx, rs3);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
-                                RAJA::TypedRangeStrideSegment<IDX_TYPE>, USE_RES>(
-                                  rs3, seg_idx, working_res, erased_working_res);
+                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                USE_RES>(rs3, seg_idx, working_res,
+                                         erased_working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = IDX_TYPE(10567);
-  srand( time(NULL) );
-  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(), erased_working_res);
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(),
+                                      erased_working_res);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedListSegment<IDX_TYPE>, USE_RES>(
-                                  l1, seg_idx, working_res, erased_working_res);
+      l1, seg_idx, working_res, erased_working_res);
 
   // test zero-length list segment
   seg_idx.clear();
-  RAJA::TypedListSegment<IDX_TYPE> l2( nullptr, seg_idx.size(), erased_working_res);
+  RAJA::TypedListSegment<IDX_TYPE> l2(nullptr, seg_idx.size(),
+                                      erased_working_res);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedListSegment<IDX_TYPE>, USE_RES>(
-                                  l2, seg_idx, working_res, erased_working_res);
+      l2, seg_idx, working_res, erased_working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelBasicSingleLoopTest,
diff --git a/test/functional/kernel/basic-single-loop/tests/test-kernel-resource-basic-single-loop-segments.hpp b/test/functional/kernel/basic-single-loop/tests/test-kernel-resource-basic-single-loop-segments.hpp
index 6f624eab2c..eb54f4763e 100644
--- a/test/functional/kernel/basic-single-loop/tests/test-kernel-resource-basic-single-loop-segments.hpp
+++ b/test/functional/kernel/basic-single-loop/tests/test-kernel-resource-basic-single-loop-segments.hpp
@@ -13,8 +13,7 @@
 TYPED_TEST_SUITE_P(KernelBasicSingleLoopTest);
 template <typename T>
 class KernelBasicSingleLoopTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelBasicSingleLoopTest, BasicSingleLoopSegmentKernel)
 {
@@ -22,81 +21,88 @@ TYPED_TEST_P(KernelBasicSingleLoopTest, BasicSingleLoopSegmentKernel)
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  WORKING_RES working_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_working_res{working_res};
+  WORKING_RES working_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_working_res {working_res};
 
   constexpr bool USE_RES = true;
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 37 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 37);
   RAJA::getIndices(seg_idx, r1);
 
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedRangeSegment<IDX_TYPE>, USE_RES>(
-                                  r1, seg_idx, working_res, erased_working_res);
+      r1, seg_idx, working_res, erased_working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 2057);
   RAJA::getIndices(seg_idx, r2);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedRangeSegment<IDX_TYPE>, USE_RES>(
-                                  r2, seg_idx, working_res, erased_working_res);
+      r2, seg_idx, working_res, erased_working_res);
 
   // test zero-length range segment
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 5, 5 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(5, 5);
   RAJA::getIndices(seg_idx, r3);
 
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedRangeSegment<IDX_TYPE>, USE_RES>(
-                                  r3, seg_idx, working_res, erased_working_res);
+      r3, seg_idx, working_res, erased_working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs1( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs1(0, 188, 2);
   RAJA::getIndices(seg_idx, rs1);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
-                                RAJA::TypedRangeStrideSegment<IDX_TYPE>, USE_RES>(
-                                  rs1, seg_idx, working_res, erased_working_res);
+                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                USE_RES>(rs1, seg_idx, working_res,
+                                         erased_working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs2( 2, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs2(2, 1029, 3);
   RAJA::getIndices(seg_idx, rs2);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
-                                RAJA::TypedRangeStrideSegment<IDX_TYPE>, USE_RES>(
-                                  rs2, seg_idx, working_res, erased_working_res);
+                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                USE_RES>(rs2, seg_idx, working_res,
+                                         erased_working_res);
 
   // test zero-length range-stride segment
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs3( 2, 2, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> rs3(2, 2, 3);
   RAJA::getIndices(seg_idx, rs3);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
-                                RAJA::TypedRangeStrideSegment<IDX_TYPE>, USE_RES>(
-                                  rs3, seg_idx, working_res, erased_working_res);
+                                RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                USE_RES>(rs3, seg_idx, working_res,
+                                         erased_working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = IDX_TYPE(10567);
-  srand( time(NULL) );
-  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(), erased_working_res);
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(),
+                                      erased_working_res);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedListSegment<IDX_TYPE>, USE_RES>(
-                                  l1, seg_idx, working_res, erased_working_res);
+      l1, seg_idx, working_res, erased_working_res);
 
   // test zero-length list segment
   seg_idx.clear();
-  RAJA::TypedListSegment<IDX_TYPE> l2( nullptr, seg_idx.size(), erased_working_res);
+  RAJA::TypedListSegment<IDX_TYPE> l2(nullptr, seg_idx.size(),
+                                      erased_working_res);
   KernelBasicSingleLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                 RAJA::TypedListSegment<IDX_TYPE>, USE_RES>(
-                                  l2, seg_idx, working_res, erased_working_res);
+      l2, seg_idx, working_res, erased_working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelBasicSingleLoopTest,
diff --git a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp
index 5a326b3c62..d321718390 100644
--- a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp
+++ b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp
@@ -28,7 +28,8 @@ void KernelConditionalFissionFusionLoopTestImpl(
 {
   IDX_TYPE data_len = IDX_TYPE(0);
 
-  if (seg_idx.size() > 0) {
+  if (seg_idx.size() > 0)
+  {
     data_len = seg_idx[seg_idx.size() - 1] + 1;
   }
 
@@ -41,23 +42,19 @@ void KernelConditionalFissionFusionLoopTestImpl(
   DATA_TYPE* test_array_y;
 
   allocateForallTestData<DATA_TYPE>(RAJA::stripIndexType(data_len),
-                                    erased_working_res,
-                                    &working_array_x,
-                                    &check_array_x,
-                                    &test_array_x);
+                                    erased_working_res, &working_array_x,
+                                    &check_array_x, &test_array_x);
 
   allocateForallTestData<DATA_TYPE>(RAJA::stripIndexType(data_len),
-                                    erased_working_res,
-                                    &working_array_y,
-                                    &check_array_y,
-                                    &test_array_y);
+                                    erased_working_res, &working_array_y,
+                                    &check_array_y, &test_array_y);
 
 
-  working_res.memset(working_array_x,
-                     0,
+  working_res.memset(working_array_x, 0,
                      sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
-  for (int param = 0; param < 2; ++param) {
+  for (int param = 0; param < 2; ++param)
+  {
 
     RAJA::kernel_param<EXEC_POLICY>(
 
@@ -65,47 +62,46 @@ void KernelConditionalFissionFusionLoopTestImpl(
 
         RAJA::make_tuple(param),
 
-        [=] RAJA_HOST_DEVICE(IDX_TYPE i) {
+        [=] RAJA_HOST_DEVICE(IDX_TYPE i)
+        {
           RAJA::atomicAdd<RAJA::auto_atomic>(
               &working_array_x[RAJA::stripIndexType(i)], (DATA_TYPE)1);
         },
 
-        [=] RAJA_HOST_DEVICE(IDX_TYPE i) {
+        [=] RAJA_HOST_DEVICE(IDX_TYPE i)
+        {
           RAJA::atomicAdd<RAJA::auto_atomic>(
               &working_array_x[RAJA::stripIndexType(i)], (DATA_TYPE)2);
         }
 
     );
 
-    working_res.memcpy(check_array_x,
-                       working_array_x,
+    working_res.memcpy(check_array_x, working_array_x,
                        sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
-    memset(static_cast<void*>(check_array_y),
-           0,
+    memset(static_cast<void*>(check_array_y), 0,
            sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
-    RAJA::forall<RAJA::seq_exec>(working_res, seg_idx, [=](IDX_TYPE i) {
-      check_array_y[RAJA::stripIndexType(i)] = 3 + 3 * param;
-    });
+    RAJA::forall<RAJA::seq_exec>(working_res, seg_idx,
+                                 [=](IDX_TYPE i) {
+                                   check_array_y[RAJA::stripIndexType(i)] =
+                                       3 + 3 * param;
+                                 });
 
 
-    for (IDX_TYPE i = IDX_TYPE(0); i < data_len; ++i) {
+    for (IDX_TYPE i = IDX_TYPE(0); i < data_len; ++i)
+    {
       ASSERT_EQ(check_array_x[RAJA::stripIndexType(i)],
                 check_array_y[RAJA::stripIndexType(i)]);
     }
   }
 
-  deallocateForallTestData<DATA_TYPE>(erased_working_res,
-                                      working_array_x,
-                                      check_array_x,
-                                      test_array_x);
+  deallocateForallTestData<DATA_TYPE>(erased_working_res, working_array_x,
+                                      check_array_x, test_array_x);
 
 
-  deallocateForallTestData<DATA_TYPE>(erased_working_res,
-                                      working_array_y,
-                                      check_array_y,
-                                      test_array_y);
+  deallocateForallTestData<DATA_TYPE>(erased_working_res, working_array_y,
+                                      check_array_y, test_array_y);
 }
 
 #endif  // __CONDITIONAL_FISSION_FUSION_LOOP_SEGMENTS_IMPL_HPP__
diff --git a/test/functional/kernel/conditional-fission-fusion-loop/tests/test-kernel-conditional-fission-fusion-loop-segments.hpp b/test/functional/kernel/conditional-fission-fusion-loop/tests/test-kernel-conditional-fission-fusion-loop-segments.hpp
index ddb2302e60..ffe659f215 100644
--- a/test/functional/kernel/conditional-fission-fusion-loop/tests/test-kernel-conditional-fission-fusion-loop-segments.hpp
+++ b/test/functional/kernel/conditional-fission-fusion-loop/tests/test-kernel-conditional-fission-fusion-loop-segments.hpp
@@ -13,18 +13,17 @@
 TYPED_TEST_SUITE_P(KernelConditionalFissionFusionLoopTest);
 template <typename T>
 class KernelConditionalFissionFusionLoopTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelConditionalFissionFusionLoopTest,
              ConditionalFissionFusionLoopSegmentKernel)
 {
-  using IDX_TYPE = typename camp::at<TypeParam, camp::num<0>>::type;
+  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  WORKING_RES working_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_working_res{working_res};
+  WORKING_RES working_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_working_res {working_res};
 
   std::vector<IDX_TYPE> seg_idx;
 
@@ -32,18 +31,14 @@ TYPED_TEST_P(KernelConditionalFissionFusionLoopTest,
   RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 37);
   RAJA::getIndices(seg_idx, r1);
 
-  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE,
-                                             EXEC_POLICY,
-                                             WORKING_RES,
+  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                              RAJA::TypedRangeSegment<IDX_TYPE>>(
       r1, seg_idx, working_res, erased_working_res);
 
   seg_idx.clear();
   RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 2057);
   RAJA::getIndices(seg_idx, r2);
-  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE,
-                                             EXEC_POLICY,
-                                             WORKING_RES,
+  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                              RAJA::TypedRangeSegment<IDX_TYPE>>(
       r2, seg_idx, working_res, erased_working_res);
 
@@ -52,9 +47,7 @@ TYPED_TEST_P(KernelConditionalFissionFusionLoopTest,
   RAJA::TypedRangeSegment<IDX_TYPE> r3(5, 5);
   RAJA::getIndices(seg_idx, r3);
 
-  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE,
-                                             EXEC_POLICY,
-                                             WORKING_RES,
+  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                              RAJA::TypedRangeSegment<IDX_TYPE>>(
       r3, seg_idx, working_res, erased_working_res);
 
@@ -63,24 +56,16 @@ TYPED_TEST_P(KernelConditionalFissionFusionLoopTest,
   RAJA::TypedRangeStrideSegment<IDX_TYPE> rs1(0, 188, 2);
   RAJA::getIndices(seg_idx, rs1);
   KernelConditionalFissionFusionLoopTestImpl<
-      IDX_TYPE,
-      EXEC_POLICY,
-      WORKING_RES,
-      RAJA::TypedRangeStrideSegment<IDX_TYPE>>(rs1,
-                                               seg_idx,
-                                               working_res,
+      IDX_TYPE, EXEC_POLICY, WORKING_RES,
+      RAJA::TypedRangeStrideSegment<IDX_TYPE>>(rs1, seg_idx, working_res,
                                                erased_working_res);
 
   seg_idx.clear();
   RAJA::TypedRangeStrideSegment<IDX_TYPE> rs2(2, 1029, 3);
   RAJA::getIndices(seg_idx, rs2);
   KernelConditionalFissionFusionLoopTestImpl<
-      IDX_TYPE,
-      EXEC_POLICY,
-      WORKING_RES,
-      RAJA::TypedRangeStrideSegment<IDX_TYPE>>(rs2,
-                                               seg_idx,
-                                               working_res,
+      IDX_TYPE, EXEC_POLICY, WORKING_RES,
+      RAJA::TypedRangeStrideSegment<IDX_TYPE>>(rs2, seg_idx, working_res,
                                                erased_working_res);
 
   // test zero-length range-stride segment
@@ -88,41 +73,33 @@ TYPED_TEST_P(KernelConditionalFissionFusionLoopTest,
   RAJA::TypedRangeStrideSegment<IDX_TYPE> rs3(2, 2, 3);
   RAJA::getIndices(seg_idx, rs3);
   KernelConditionalFissionFusionLoopTestImpl<
-      IDX_TYPE,
-      EXEC_POLICY,
-      WORKING_RES,
-      RAJA::TypedRangeStrideSegment<IDX_TYPE>>(rs3,
-                                               seg_idx,
-                                               working_res,
+      IDX_TYPE, EXEC_POLICY, WORKING_RES,
+      RAJA::TypedRangeStrideSegment<IDX_TYPE>>(rs3, seg_idx, working_res,
                                                erased_working_res);
 
   // List segment tests
   seg_idx.clear();
   IDX_TYPE last = IDX_TYPE(10567);
   srand(time(NULL));
-  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i) {
+  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i)
+  {
     IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
-    if (i < randval) {
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0],
-                                      seg_idx.size(),
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(),
                                       erased_working_res);
-  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE,
-                                             EXEC_POLICY,
-                                             WORKING_RES,
+  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                              RAJA::TypedListSegment<IDX_TYPE>>(
       l1, seg_idx, working_res, erased_working_res);
 
   // test zero-length list segment
   seg_idx.clear();
-  RAJA::TypedListSegment<IDX_TYPE> l2(nullptr,
-                                      seg_idx.size(),
+  RAJA::TypedListSegment<IDX_TYPE> l2(nullptr, seg_idx.size(),
                                       erased_working_res);
-  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE,
-                                             EXEC_POLICY,
-                                             WORKING_RES,
+  KernelConditionalFissionFusionLoopTestImpl<IDX_TYPE, EXEC_POLICY, WORKING_RES,
                                              RAJA::TypedListSegment<IDX_TYPE>>(
       l2, seg_idx, working_res, erased_working_res);
 }
diff --git a/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp b/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp
index ddae647f83..8645ae0b33 100644
--- a/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp
+++ b/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp
@@ -11,80 +11,98 @@
 #include <numeric>
 #include <type_traits>
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POLICY>
-void KernelHyperplane2DTestImpl(const int groups, const int idim, const int jdim)
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+void KernelHyperplane2DTestImpl(const int groups,
+                                const int idim,
+                                const int jdim)
 {
-  // This test traverses "groups" 2D arrays, and modifies values in a 1D hyperplane manner.
+  // This test traverses "groups" 2D arrays, and modifies values in a 1D
+  // hyperplane manner.
 
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   INDEX_TYPE array_length = groups * idim * jdim;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
-  RAJA::View<DATA_TYPE, RAJA::Layout<3, INDEX_TYPE>> HostView( test_array, groups, idim, jdim );
-  RAJA::View<DATA_TYPE, RAJA::Layout<3, INDEX_TYPE>> WorkView( work_array, groups, idim, jdim );
-  RAJA::View<DATA_TYPE, RAJA::Layout<3, INDEX_TYPE>> CheckView( check_array, groups, idim, jdim );
+  RAJA::View<DATA_TYPE, RAJA::Layout<3, INDEX_TYPE>> HostView(
+      test_array, groups, idim, jdim);
+  RAJA::View<DATA_TYPE, RAJA::Layout<3, INDEX_TYPE>> WorkView(
+      work_array, groups, idim, jdim);
+  RAJA::View<DATA_TYPE, RAJA::Layout<3, INDEX_TYPE>> CheckView(
+      check_array, groups, idim, jdim);
 
   // initialize array
-  std::iota( test_array, test_array + array_length, 1 );
+  std::iota(test_array, test_array + array_length, 1);
 
-  work_res.memcpy( work_array, test_array, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(work_array, test_array, sizeof(DATA_TYPE) * array_length);
 
   RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> trip_count(0);
   RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> oob_count(0);
 
-  // perform array arithmetic with a 1D hyperplane, in either the I or J direction
-  RAJA::TypedRangeSegment<INDEX_TYPE>  Grange( 0, groups );
-  RAJA::TypedRangeSegment<INDEX_TYPE>  Irange( 0, idim );
-  RAJA::TypedRangeSegment<INDEX_TYPE>  Jrange( 0, jdim );
-
-  RAJA::kernel<EXEC_POLICY> ( RAJA::make_tuple( Grange, Irange, Jrange ),
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE g, INDEX_TYPE ii, INDEX_TYPE jj ) {
-      if ((int)g < 0 || (int)g >= groups || (int)ii < 0 || (int)ii >= idim || (int)jj < 0 || (int)jj >= jdim) {
-        oob_count += 1;
-      }
+  // perform array arithmetic with a 1D hyperplane, in either the I or J
+  // direction
+  RAJA::TypedRangeSegment<INDEX_TYPE> Grange(0, groups);
+  RAJA::TypedRangeSegment<INDEX_TYPE> Irange(0, idim);
+  RAJA::TypedRangeSegment<INDEX_TYPE> Jrange(0, jdim);
+
+  RAJA::kernel<EXEC_POLICY>(
+      RAJA::make_tuple(Grange, Irange, Jrange),
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE g, INDEX_TYPE ii, INDEX_TYPE jj)
+      {
+        if ((int)g < 0 || (int)g >= groups || (int)ii < 0 || (int)ii >= idim ||
+            (int)jj < 0 || (int)jj >= jdim)
+        {
+          oob_count += 1;
+        }
 
-      DATA_TYPE left = 1;
-      if (ii > 0) {
-        left = WorkView(g, ii - 1, jj);
-      }
+        DATA_TYPE left = 1;
+        if (ii > 0)
+        {
+          left = WorkView(g, ii - 1, jj);
+        }
 
-      DATA_TYPE up = 1;
-      if (jj > 0) {
-        up = WorkView(g, ii, jj - 1);
-      }
+        DATA_TYPE up = 1;
+        if (jj > 0)
+        {
+          up = WorkView(g, ii, jj - 1);
+        }
 
-      WorkView(g, ii, jj) = left + up;
+        WorkView(g, ii, jj) = left + up;
 
-      trip_count += 1;
-  });
+        trip_count += 1;
+      });
 
-  work_res.memcpy( check_array, work_array, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(check_array, work_array, sizeof(DATA_TYPE) * array_length);
 
   ASSERT_EQ((INDEX_TYPE)trip_count.get(), (INDEX_TYPE)groups * idim * jdim);
   ASSERT_EQ((INDEX_TYPE)oob_count.get(), (INDEX_TYPE)0);
 
   // perform array arithmetic on the CPU
-  for (int g = 0; g < groups; ++g) {
-    for (int i = 0; i < idim; ++i) {
-      for (int j = 0; j < jdim; ++j) {
+  for (int g = 0; g < groups; ++g)
+  {
+    for (int i = 0; i < idim; ++i)
+    {
+      for (int j = 0; j < jdim; ++j)
+      {
         DATA_TYPE left = 1;
-        if (i > 0) {
+        if (i > 0)
+        {
           left = HostView(g, i - 1, j);
         }
 
         DATA_TYPE up = 1;
-        if (j > 0) {
+        if (j > 0)
+        {
           up = HostView(g, i, j - 1);
         }
 
@@ -93,42 +111,43 @@ void KernelHyperplane2DTestImpl(const int groups, const int idim, const int jdim
     }
   }
 
-  for (int g = 0; g < groups; ++g) {
-    for (int i = 0; i < idim; ++i) {
-      for (int j = 0; j < jdim; ++j) {
+  for (int g = 0; g < groups; ++g)
+  {
+    for (int i = 0; i < idim; ++i)
+    {
+      for (int j = 0; j < jdim; ++j)
+      {
         ASSERT_FLOAT_EQ(CheckView(g, i, j), HostView(g, i, j));
       }
     }
   }
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
 }
 
 
 TYPED_TEST_SUITE_P(KernelHyperplane2DTest);
 template <typename T>
 class KernelHyperplane2DTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelHyperplane2DTest, Hyperplane2DKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  KernelHyperplane2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(1, 10, 10);
-  KernelHyperplane2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(2, 111, 205);
-  KernelHyperplane2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(3, 213, 123);
+  KernelHyperplane2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                             REDUCE_POLICY>(1, 10, 10);
+  KernelHyperplane2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                             REDUCE_POLICY>(2, 111, 205);
+  KernelHyperplane2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                             REDUCE_POLICY>(3, 213, 123);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelHyperplane2DTest,
-                            Hyperplane2DKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelHyperplane2DTest, Hyperplane2DKernel);
 
 #endif  // __TEST_KERNEL_HYPERPLANE_2D_HPP__
diff --git a/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-3D.hpp b/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-3D.hpp
index 321f43d6a6..611d8fd3bf 100644
--- a/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-3D.hpp
+++ b/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-3D.hpp
@@ -11,21 +11,38 @@
 #include <numeric>
 #include <type_traits>
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POLICY>
-typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type
-KernelHyperplane3DTestImpl(const int RAJA_UNUSED_ARG(groups), const int RAJA_UNUSED_ARG(idim), const int RAJA_UNUSED_ARG(jdim), const int RAJA_UNUSED_ARG(kdim))
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+typename std::enable_if<
+    std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type
+KernelHyperplane3DTestImpl(const int RAJA_UNUSED_ARG(groups),
+                           const int RAJA_UNUSED_ARG(idim),
+                           const int RAJA_UNUSED_ARG(jdim),
+                           const int RAJA_UNUSED_ARG(kdim))
 {
   // do nothing for unsigned index types
 }
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POLICY>
-typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type
-KernelHyperplane3DTestImpl(const int groups, const int idimin, const int jdimin, const int kdimin)
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
+typename std::enable_if<
+    std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type
+KernelHyperplane3DTestImpl(const int groups,
+                           const int idimin,
+                           const int jdimin,
+                           const int kdimin)
 {
-  // This test traverses "groups" number of 3D arrays, and modifies values in a 2D hyperplane manner.
+  // This test traverses "groups" number of 3D arrays, and modifies values in a
+  // 2D hyperplane manner.
 
   int idim, jdim, kdim;
-  if ( std::is_same<DATA_TYPE, float>::value )
+  if (std::is_same<DATA_TYPE, float>::value)
   {
     // Restrict to a small data size for better float precision.
     idim = 5;
@@ -39,87 +56,102 @@ KernelHyperplane3DTestImpl(const int groups, const int idimin, const int jdimin,
     kdim = kdimin;
   }
 
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   INDEX_TYPE array_length = groups * idim * jdim * kdim;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
-  RAJA::View<DATA_TYPE, RAJA::Layout<4, INDEX_TYPE>> HostView( test_array, groups, idim, jdim, kdim );
-  RAJA::View<DATA_TYPE, RAJA::Layout<4, INDEX_TYPE>> WorkView( work_array, groups, idim, jdim, kdim );
-  RAJA::View<DATA_TYPE, RAJA::Layout<4, INDEX_TYPE>> CheckView( check_array, groups, idim, jdim, kdim );
+  RAJA::View<DATA_TYPE, RAJA::Layout<4, INDEX_TYPE>> HostView(
+      test_array, groups, idim, jdim, kdim);
+  RAJA::View<DATA_TYPE, RAJA::Layout<4, INDEX_TYPE>> WorkView(
+      work_array, groups, idim, jdim, kdim);
+  RAJA::View<DATA_TYPE, RAJA::Layout<4, INDEX_TYPE>> CheckView(
+      check_array, groups, idim, jdim, kdim);
 
   // initialize array
-  std::iota( test_array, test_array + array_length, 1 );
+  std::iota(test_array, test_array + array_length, 1);
 
-  work_res.memcpy( work_array, test_array, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(work_array, test_array, sizeof(DATA_TYPE) * array_length);
 
   RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> trip_count(0);
   RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> oob_count(0);
 
   // perform array arithmetic with a 2D J-K hyperplane
-  RAJA::TypedRangeSegment<INDEX_TYPE>   Grange( 0, groups );
-  RAJA::TypedRangeStrideSegment<INDEX_TYPE>  Irange( 0, idim, 1 );
-  RAJA::TypedRangeStrideSegment<INDEX_TYPE>  Jrange( jdim-1, -1, -1 );
-  RAJA::TypedRangeStrideSegment<INDEX_TYPE>  Krange( 0, kdim, 1 );
-
-  RAJA::kernel<EXEC_POLICY> ( RAJA::make_tuple( Grange, Irange, Jrange, Krange ),
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE g, INDEX_TYPE ii, INDEX_TYPE jj, INDEX_TYPE kk ) {
-      if (g < 0 || g >= groups || ii < 0 || ii >= idim || jj < 0 || jj >= jdim || kk < 0 || kk >= kdim) {
-        oob_count += 1;
-      }
-
-      DATA_TYPE left = 1;
-      if (ii > 0) {
-        left = WorkView(g, ii - 1, jj, kk);
-      }
-
-      DATA_TYPE up = 1;
-      if (jj > 0) {
-        up = WorkView(g, ii, jj - 1, kk);
-      }
-
-      DATA_TYPE back = 1;
-      if (kk > 0) {
-        back = WorkView(g, ii, jj, kk - 1);
-      }
-
-      WorkView(g, ii, jj, kk) = left + up + back;
-
-      trip_count += 1;
-  });
-
-  work_res.memcpy( check_array, work_array, sizeof(DATA_TYPE) * array_length );
-
-  ASSERT_EQ((INDEX_TYPE)trip_count.get(), (INDEX_TYPE)groups * idim * jdim * kdim);
+  RAJA::TypedRangeSegment<INDEX_TYPE> Grange(0, groups);
+  RAJA::TypedRangeStrideSegment<INDEX_TYPE> Irange(0, idim, 1);
+  RAJA::TypedRangeStrideSegment<INDEX_TYPE> Jrange(jdim - 1, -1, -1);
+  RAJA::TypedRangeStrideSegment<INDEX_TYPE> Krange(0, kdim, 1);
+
+  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(Grange, Irange, Jrange, Krange),
+                            [=] RAJA_HOST_DEVICE(INDEX_TYPE g, INDEX_TYPE ii,
+                                                 INDEX_TYPE jj, INDEX_TYPE kk)
+                            {
+                              if (g < 0 || g >= groups || ii < 0 ||
+                                  ii >= idim || jj < 0 || jj >= jdim ||
+                                  kk < 0 || kk >= kdim)
+                              {
+                                oob_count += 1;
+                              }
+
+                              DATA_TYPE left = 1;
+                              if (ii > 0)
+                              {
+                                left = WorkView(g, ii - 1, jj, kk);
+                              }
+
+                              DATA_TYPE up = 1;
+                              if (jj > 0)
+                              {
+                                up = WorkView(g, ii, jj - 1, kk);
+                              }
+
+                              DATA_TYPE back = 1;
+                              if (kk > 0)
+                              {
+                                back = WorkView(g, ii, jj, kk - 1);
+                              }
+
+                              WorkView(g, ii, jj, kk) = left + up + back;
+
+                              trip_count += 1;
+                            });
+
+  work_res.memcpy(check_array, work_array, sizeof(DATA_TYPE) * array_length);
+
+  ASSERT_EQ((INDEX_TYPE)trip_count.get(),
+            (INDEX_TYPE)groups * idim * jdim * kdim);
   ASSERT_EQ((INDEX_TYPE)oob_count.get(), (INDEX_TYPE)0);
 
   // perform array arithmetic on the CPU
-  for (int g = 0; g < groups; ++g) {
-    for (int i = 0; i < idim; ++i) {
-      for (int j = jdim - 1; j >= 0; --j) {
-        for (int k = 0; k < kdim; ++k) {
+  for (int g = 0; g < groups; ++g)
+  {
+    for (int i = 0; i < idim; ++i)
+    {
+      for (int j = jdim - 1; j >= 0; --j)
+      {
+        for (int k = 0; k < kdim; ++k)
+        {
           DATA_TYPE left = 1;
-          if (i > 0) {
+          if (i > 0)
+          {
             left = HostView(g, i - 1, j, k);
           }
 
           DATA_TYPE up = 1;
-          if (j > 0) {
+          if (j > 0)
+          {
             up = HostView(g, i, j - 1, k);
           }
 
           DATA_TYPE back = 1;
-          if (k > 0) {
+          if (k > 0)
+          {
             back = HostView(g, i, j, k - 1);
           }
 
@@ -129,44 +161,46 @@ KernelHyperplane3DTestImpl(const int groups, const int idimin, const int jdimin,
     }
   }
 
-  for (int g = 0; g < groups; ++g) {
-    for (int i = 0; i < idim; ++i) {
-      for (int j = 0; j < jdim; ++j) {
-        for (int k = 0; k < kdim; ++k) {
+  for (int g = 0; g < groups; ++g)
+  {
+    for (int i = 0; i < idim; ++i)
+    {
+      for (int j = 0; j < jdim; ++j)
+      {
+        for (int k = 0; k < kdim; ++k)
+        {
           ASSERT_FLOAT_EQ(CheckView(g, i, j, k), HostView(g, i, j, k));
         }
       }
     }
   }
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
 }
 
 
 TYPED_TEST_SUITE_P(KernelHyperplane3DTest);
 template <typename T>
 class KernelHyperplane3DTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelHyperplane3DTest, Hyperplane3DKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  KernelHyperplane3DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(1, 10, 10, 10);
-  KernelHyperplane3DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(2, 151, 111, 205);
-  KernelHyperplane3DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(3, 101, 213, 123);
+  KernelHyperplane3DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                             REDUCE_POLICY>(1, 10, 10, 10);
+  KernelHyperplane3DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                             REDUCE_POLICY>(2, 151, 111, 205);
+  KernelHyperplane3DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                             REDUCE_POLICY>(3, 101, 213, 123);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelHyperplane3DTest,
-                            Hyperplane3DKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelHyperplane3DTest, Hyperplane3DKernel);
 
 #endif  // __TEST_KERNEL_HYPERPLANE_3D_HPP__
diff --git a/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp b/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp
index 10923b9da2..a58cc80812 100644
--- a/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp
+++ b/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp
@@ -15,10 +15,15 @@
 #include <random>
 #include <type_traits>
 
-template <typename EXEC_POLICY, typename REDUCE_POLICY, typename ABSTRACTION,
-          typename DATA_TYPE, typename IDX_TYPE,
-          typename SEGMENTS_TYPE, typename Container,
-          typename WORKING_RES, typename RandomGenerator>
+template <typename EXEC_POLICY,
+          typename REDUCE_POLICY,
+          typename ABSTRACTION,
+          typename DATA_TYPE,
+          typename IDX_TYPE,
+          typename SEGMENTS_TYPE,
+          typename Container,
+          typename WORKING_RES,
+          typename RandomGenerator>
 // use enable_if in return type to appease nvcc 11.2
 // add bool return type to disambiguate signatures of these functions for MSVC
 std::enable_if_t<!ABSTRACTION::template supports<DATA_TYPE>(), bool>
@@ -26,12 +31,19 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE&,
                                 const Container&,
                                 WORKING_RES,
                                 RandomGenerator&)
-{ return false; }
+{
+  return false;
+}
 ///
-template <typename EXEC_POLICY, typename REDUCE_POLICY, typename ABSTRACTION,
-          typename DATA_TYPE, typename IDX_TYPE,
-          typename SEGMENTS_TYPE, typename Container,
-          typename WORKING_RES, typename RandomGenerator>
+template <typename EXEC_POLICY,
+          typename REDUCE_POLICY,
+          typename ABSTRACTION,
+          typename DATA_TYPE,
+          typename IDX_TYPE,
+          typename SEGMENTS_TYPE,
+          typename Container,
+          typename WORKING_RES,
+          typename RandomGenerator>
 // use enable_if in return type to appease nvcc 11.2
 std::enable_if_t<ABSTRACTION::template supports<DATA_TYPE>()>
 KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
@@ -40,7 +52,8 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
                                 RandomGenerator& rngen)
 {
   using RAJA::get;
-  using MULTIREDUCER = typename ABSTRACTION::template multi_reducer<REDUCE_POLICY, DATA_TYPE>;
+  using MULTIREDUCER =
+      typename ABSTRACTION::template multi_reducer<REDUCE_POLICY, DATA_TYPE>;
 
   auto si = get<2>(segments);
   auto sj = get<1>(segments);
@@ -50,13 +63,13 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
   RAJA_EXTRACT_BED_SUFFIXED(sj, _sj);
   RAJA_EXTRACT_BED_SUFFIXED(sk, _sk);
 
-  IDX_TYPE dimi = begin_si[distance_si-1] + 1;
-  IDX_TYPE dimj = begin_sj[distance_sj-1] + 1;
-  IDX_TYPE dimk = begin_sk[distance_sk-1] + 1;
+  IDX_TYPE dimi = begin_si[distance_si - 1] + 1;
+  IDX_TYPE dimj = begin_sj[distance_sj - 1] + 1;
+  IDX_TYPE dimk = begin_sk[distance_sk - 1] + 1;
 
   const IDX_TYPE idx_range = dimi * dimj * dimk;
 
-  const int modval = 100;
+  const int modval      = 100;
   const size_t num_bins = multi_init.size();
 
   IDX_TYPE* working_range;
@@ -73,51 +86,50 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
 
   IDX_TYPE data_len = 0;
 
-  allocateForallTestData(idx_range+1,
-                         working_res,
-                         &working_range,
-                         &check_range,
-                         &test_range);
+  allocateForallTestData(idx_range + 1, working_res, &working_range,
+                         &check_range, &test_range);
 
-  for (IDX_TYPE i = 0; i < idx_range+1; ++i) {
+  for (IDX_TYPE i = 0; i < idx_range + 1; ++i)
+  {
     test_range[i] = ~IDX_TYPE(0);
   }
 
   {
-    std::uniform_int_distribution<IDX_TYPE> work_per_iterate_distribution(0, num_bins);
-
-    for (IDX_TYPE k : sk) {
-      for (IDX_TYPE j : sj) {
-        for (IDX_TYPE i : si) {
-          IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
+    std::uniform_int_distribution<IDX_TYPE> work_per_iterate_distribution(
+        0, num_bins);
+
+    for (IDX_TYPE k : sk)
+    {
+      for (IDX_TYPE j : sj)
+      {
+        for (IDX_TYPE i : si)
+        {
+          IDX_TYPE ii    = (dimi * dimj * k) + (dimi * j) + i;
           test_range[ii] = data_len;
           data_len += work_per_iterate_distribution(rngen);
-          test_range[ii+1] = data_len;
+          test_range[ii + 1] = data_len;
         }
       }
     }
   }
 
-  allocateForallTestData(data_len,
-                         working_res,
-                         &working_array,
-                         &check_array,
+  allocateForallTestData(data_len, working_res, &working_array, &check_array,
                          &test_array);
 
-  allocateForallTestData(data_len,
-                         working_res,
-                         &working_bins,
-                         &check_bins,
+  allocateForallTestData(data_len, working_res, &working_bins, &check_bins,
                          &test_bins);
 
-  if (data_len > IDX_TYPE(0)) {
+  if (data_len > IDX_TYPE(0))
+  {
 
-    // use ints to initialize array here to avoid floating point precision issues
-    std::uniform_int_distribution<int> array_int_distribution(0, modval-1);
-    std::uniform_int_distribution<IDX_TYPE> bin_distribution(0, num_bins-1);
+    // use ints to initialize array here to avoid floating point precision
+    // issues
+    std::uniform_int_distribution<int> array_int_distribution(0, modval - 1);
+    std::uniform_int_distribution<IDX_TYPE> bin_distribution(0, num_bins - 1);
 
 
-    for (IDX_TYPE i = 0; i < data_len; ++i) {
+    for (IDX_TYPE i = 0; i < data_len; ++i)
+    {
       test_array[i] = DATA_TYPE(array_int_distribution(rngen));
 
       // this may use the same bin multiple times per iterate
@@ -125,7 +137,8 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
     }
   }
 
-  working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1));
+  working_res.memcpy(working_range, test_range,
+                     sizeof(IDX_TYPE) * (idx_range + 1));
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
   working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len);
 
@@ -137,21 +150,28 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
   {
     std::vector<DATA_TYPE> ref_vals(num_bins, ABSTRACTION::identity(red));
 
-    for (IDX_TYPE i = 0; i < data_len; ++i) {
-      ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
+    for (IDX_TYPE i = 0; i < data_len; ++i)
+    {
+      ref_vals[test_bins[i]] =
+          ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
     }
 
-    RAJA::kernel_resource<EXEC_POLICY>(segments, working_res,
-        [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) {
-      IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
-      for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-        ABSTRACTION::reduce(red[working_bins[idx]],  working_array[idx]);
-        ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]);
-      }
-    });
+    RAJA::kernel_resource<EXEC_POLICY>(
+        segments, working_res,
+        [=] RAJA_HOST_DEVICE(IDX_TYPE k, IDX_TYPE j, IDX_TYPE i)
+        {
+          IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
+          for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii + 1];
+               ++idx)
+          {
+            ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
+            ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]);
+          }
+        });
 
     size_t bin = 0;
-    for (auto init_val : multi_init) {
+    for (auto init_val : multi_init)
+    {
       ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]);
       ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val));
       ++bin;
@@ -161,46 +181,60 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
 
   red.reset();
 
-  // basic multiple use test, ensure same reducer can combine values from multiple loops
+  // basic multiple use test, ensure same reducer can combine values from
+  // multiple loops
   {
     std::vector<DATA_TYPE> ref_vals(num_bins, ABSTRACTION::identity(red));
 
     const int nloops = 2;
-    for (int j = 0; j < nloops; ++j) {
+    for (int j = 0; j < nloops; ++j)
+    {
 
-      for (IDX_TYPE i = 0; i < data_len; ++i) {
-        ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
+      for (IDX_TYPE i = 0; i < data_len; ++i)
+      {
+        ref_vals[test_bins[i]] =
+            ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
       }
 
-      RAJA::kernel_resource<EXEC_POLICY>(segments, working_res,
-          [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) {
-        IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
-        for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-          ABSTRACTION::reduce(red[working_bins[idx]],  working_array[idx]);
-        }
-      });
+      RAJA::kernel_resource<EXEC_POLICY>(
+          segments, working_res,
+          [=] RAJA_HOST_DEVICE(IDX_TYPE k, IDX_TYPE j, IDX_TYPE i)
+          {
+            IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
+            for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii + 1];
+                 ++idx)
+            {
+              ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
+            }
+          });
     }
 
-    for (size_t bin = 0; bin < num_bins; ++bin) {
+    for (size_t bin = 0; bin < num_bins; ++bin)
+    {
       ASSERT_EQ(static_cast<DATA_TYPE>(red[bin].get()), ref_vals[bin]);
     }
   }
 
 
   // test the consistency of answers, if we expect them to be consistent
-  if (ABSTRACTION::consistent(red)) {
+  if (ABSTRACTION::consistent(red))
+  {
 
-    if /* constexpr */ (std::is_floating_point<DATA_TYPE>::value) {
+    if /* constexpr */ (std::is_floating_point<DATA_TYPE>::value)
+    {
 
       // use floating point values to accentuate floating point precision issues
       std::conditional_t<!std::is_floating_point<DATA_TYPE>::value,
-          std::uniform_int_distribution<DATA_TYPE>,
-          std::uniform_real_distribution<DATA_TYPE>> array_flt_distribution(0, modval-1);
+                         std::uniform_int_distribution<DATA_TYPE>,
+                         std::uniform_real_distribution<DATA_TYPE>>
+          array_flt_distribution(0, modval - 1);
 
-      for (IDX_TYPE i = 0; i < data_len; ++i) {
+      for (IDX_TYPE i = 0; i < data_len; ++i)
+      {
         test_array[i] = DATA_TYPE(array_flt_distribution(rngen));
       }
-      working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
+      working_res.memcpy(working_array, test_array,
+                         sizeof(DATA_TYPE) * data_len);
     }
 
 
@@ -208,23 +242,32 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
     bool got_ref_vals = false;
 
     const int nloops = 2;
-    for (int j = 0; j < nloops; ++j) {
+    for (int j = 0; j < nloops; ++j)
+    {
       red.reset();
 
-      RAJA::kernel_resource<EXEC_POLICY>(segments, working_res,
-          [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) {
-        IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
-        for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-          ABSTRACTION::reduce(red[working_bins[idx]],  working_array[idx]);
-        }
-      });
-
-      if (!got_ref_vals) {
+      RAJA::kernel_resource<EXEC_POLICY>(
+          segments, working_res,
+          [=] RAJA_HOST_DEVICE(IDX_TYPE k, IDX_TYPE j, IDX_TYPE i)
+          {
+            IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
+            for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii + 1];
+                 ++idx)
+            {
+              ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
+            }
+          });
+
+      if (!got_ref_vals)
+      {
         ref_vals.resize(num_bins);
         red.get_all(ref_vals);
         got_ref_vals = true;
-      } else {
-        for (size_t bin = 0; bin < num_bins; ++bin) {
+      }
+      else
+      {
+        for (size_t bin = 0; bin < num_bins; ++bin)
+        {
           ASSERT_EQ(red.get(bin), ref_vals[bin]);
         }
       }
@@ -232,76 +275,67 @@ KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
   }
 
 
-  deallocateForallTestData(working_res,
-                           working_bins,
-                           check_bins,
-                           test_bins);
-  deallocateForallTestData(working_res,
-                           working_array,
-                           check_array,
-                           test_array);
-  deallocateForallTestData(working_res,
-                           working_range,
-                           check_range,
-                           test_range);
+  deallocateForallTestData(working_res, working_bins, check_bins, test_bins);
+  deallocateForallTestData(working_res, working_array, check_array, test_array);
+  deallocateForallTestData(working_res, working_range, check_range, test_range);
 }
 
 
 TYPED_TEST_SUITE_P(KernelMultiReduceNestedTest);
 template <typename T>
 class KernelMultiReduceNestedTest : public ::testing::Test
-{
-};
+{};
 
 //
 //
 // Defining the Kernel Loop structure for MultiReduce Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename POLICY_DATA>
 struct MultiReduceNestedLoopExec;
 
-template<typename POLICY_DATA>
-struct MultiReduceNestedLoopExec<DEPTH_3, POLICY_DATA> {
-  using type =
-    RAJA::KernelPolicy<
-      RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      >
-    >;
+template <typename POLICY_DATA>
+struct MultiReduceNestedLoopExec<DEPTH_3, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      0,
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::statement::For<
+          1,
+          typename camp::at<POLICY_DATA, camp::num<1>>::type,
+          RAJA::statement::For<
+              2,
+              typename camp::at<POLICY_DATA, camp::num<2>>::type,
+              RAJA::statement::Lambda<0>>>>>;
 };
 
-template<typename POLICY_DATA>
-struct MultiReduceNestedLoopExec<DEPTH_3_COLLAPSE, POLICY_DATA> {
-  using type =
-    RAJA::KernelPolicy<
-      RAJA::statement::Collapse< typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::ArgList<0,1,2>,
-        RAJA::statement::Lambda<0>
-      >
-    >;
+template <typename POLICY_DATA>
+struct MultiReduceNestedLoopExec<DEPTH_3_COLLAPSE, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::Collapse<
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::ArgList<0, 1, 2>,
+      RAJA::statement::Lambda<0>>>;
 };
 
-#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or defined(RAJA_ENABLE_SYCL)
+#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or                   \
+    defined(RAJA_ENABLE_SYCL)
 
-template<typename POLICY_DATA>
-struct MultiReduceNestedLoopExec<DEVICE_DEPTH_3, POLICY_DATA> {
+template <typename POLICY_DATA>
+struct MultiReduceNestedLoopExec<DEVICE_DEPTH_3, POLICY_DATA>
+{
   using type =
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-              RAJA::statement::Lambda<0>
-            >
-          >
-        >
-      > // end DEVICE_KERNEL
-    >;
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::For<
+                  2,
+                  typename camp::at<POLICY_DATA, camp::num<2>>::type,
+                  RAJA::statement::Lambda<0>>>>>  // end DEVICE_KERNEL
+                         >;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP or RAJA_ENABLE_SYCL
@@ -317,46 +351,52 @@ TYPED_TEST_P(KernelMultiReduceNestedTest, MultiReduceNestedKernel)
 
   using LOOP_TYPE = typename EXEC_POL_DATA::LoopType;
   using LOOP_POLS = typename EXEC_POL_DATA::type;
-  using EXEC_POLICY = typename MultiReduceNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename MultiReduceNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
 
   // for setting random values in arrays
-  auto random_seed = std::random_device{}();
+  auto random_seed = std::random_device {}();
   std::mt19937 rngen(random_seed);
 
-  WORKING_RES working_res{WORKING_RES::get_default()};
+  WORKING_RES working_res {WORKING_RES::get_default()};
 
   std::vector<DATA_TYPE> container;
 
   std::vector<size_t> num_bins_max_container({0, 1, 100});
   size_t num_bins_min = 0;
-  for (size_t num_bins_max : num_bins_max_container) {
+  for (size_t num_bins_max : num_bins_max_container)
+  {
 
-    std::uniform_int_distribution<size_t> num_bins_dist(num_bins_min, num_bins_max);
-    num_bins_min = num_bins_max+1;
+    std::uniform_int_distribution<size_t> num_bins_dist(num_bins_min,
+                                                        num_bins_max);
+    num_bins_min    = num_bins_max + 1;
     size_t num_bins = num_bins_dist(rngen);
 
     container.resize(num_bins, DATA_TYPE(2));
 
     // Range segment tests
-    auto s1 = RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>( 0, 2 ),
-                               RAJA::TypedRangeSegment<IDX_TYPE>( 0, 7 ),
-                               RAJA::TypedRangeSegment<IDX_TYPE>( 0, 3 ));
-    KernelMultiReduceNestedTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION, DATA_TYPE, IDX_TYPE>(
-                                   s1, container, working_res, rngen);
-
-    auto s2 = RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>( 2, 35 ),
-                               RAJA::TypedRangeSegment<IDX_TYPE>( 0, 19 ),
-                               RAJA::TypedRangeSegment<IDX_TYPE>( 3, 13 ));
-    KernelMultiReduceNestedTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION, DATA_TYPE, IDX_TYPE>(
-                                   s2, container, working_res, rngen);
+    auto s1 = RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>(0, 2),
+                               RAJA::TypedRangeSegment<IDX_TYPE>(0, 7),
+                               RAJA::TypedRangeSegment<IDX_TYPE>(0, 3));
+    KernelMultiReduceNestedTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION,
+                                    DATA_TYPE, IDX_TYPE>(s1, container,
+                                                         working_res, rngen);
+
+    auto s2 = RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>(2, 35),
+                               RAJA::TypedRangeSegment<IDX_TYPE>(0, 19),
+                               RAJA::TypedRangeSegment<IDX_TYPE>(3, 13));
+    KernelMultiReduceNestedTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION,
+                                    DATA_TYPE, IDX_TYPE>(s2, container,
+                                                         working_res, rngen);
 
     // Range-stride segment tests
-    auto s3 = RAJA::make_tuple(RAJA::TypedRangeStrideSegment<IDX_TYPE>( 0, 6, 2 ),
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>( 1, 38, 3 ),
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>( 5, 17, 1 ));
-    KernelMultiReduceNestedTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION, DATA_TYPE, IDX_TYPE>(
-                                   s3, container, working_res, rngen);
-
+    auto s3 =
+        RAJA::make_tuple(RAJA::TypedRangeStrideSegment<IDX_TYPE>(0, 6, 2),
+                         RAJA::TypedRangeStrideSegment<IDX_TYPE>(1, 38, 3),
+                         RAJA::TypedRangeStrideSegment<IDX_TYPE>(5, 17, 1));
+    KernelMultiReduceNestedTestImpl<EXEC_POLICY, REDUCE_POLICY, ABSTRACTION,
+                                    DATA_TYPE, IDX_TYPE>(s3, container,
+                                                         working_res, rngen);
   }
 }
 
diff --git a/test/functional/kernel/nested-loop-reducesum/tests/nested-loop-BlockReduceSum-impl.hpp b/test/functional/kernel/nested-loop-reducesum/tests/nested-loop-BlockReduceSum-impl.hpp
index 216aee14d6..ae856ae553 100644
--- a/test/functional/kernel/nested-loop-reducesum/tests/nested-loop-BlockReduceSum-impl.hpp
+++ b/test/functional/kernel/nested-loop-reducesum/tests/nested-loop-BlockReduceSum-impl.hpp
@@ -10,22 +10,30 @@
 
 #include <numeric>
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, PARAMS&& params, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_param_resource<EXEC_POL>( segs, params, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type call_kernel(SEGMENTS&& segs,
+                                                        PARAMS&& params,
+                                                        WORKING_RES work_res,
+                                                        Args&&... args)
+{
+  RAJA::kernel_param_resource<EXEC_POL>(segs, params, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args) {
-  RAJA::kernel_param<EXEC_POL>( segs, params, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel_param<EXEC_POL>(segs, params, args...);
 }
 
 //
@@ -33,80 +41,81 @@ typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, PARA
 // Define list of nested loop types the Block test supports.
 //
 //
-using BlockReduceSumSupportedLoopTypeList = camp::list<
-  DEPTH_1_REDUCESUM,
-  DEVICE_DEPTH_1_REDUCESUM
-  >;
+using BlockReduceSumSupportedLoopTypeList =
+    camp::list<DEPTH_1_REDUCESUM, DEVICE_DEPTH_1_REDUCESUM>;
 
 //
 //
 // Nest loop trip count test.
 //
 //
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
-void KernelNestedLoopTest(const DEPTH_1_REDUCESUM&, const int N){
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
+void KernelNestedLoopTest(const DEPTH_1_REDUCESUM&, const int N)
+{
 
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   // Allocate Tests Data
-  int * work_array;
-  int * check_array;
-  int * test_array;
-
-  allocateForallTestData<int>(N,
-                              erased_work_res,
-                              &work_array,
-                              &check_array,
+  int* work_array;
+  int* check_array;
+  int* test_array;
+
+  allocateForallTestData<int>(N, erased_work_res, &work_array, &check_array,
                               &test_array);
 
-  RAJA::TypedRangeSegment<int> range(0,N);
+  RAJA::TypedRangeSegment<int> range(0, N);
 
   // Initialize Data
   std::iota(test_array, test_array + RAJA::stripIndexType(N), 0);
 
-  erased_work_res.memcpy(work_array, test_array, sizeof(int) * RAJA::stripIndexType(N));
-  
+  erased_work_res.memcpy(work_array, test_array,
+                         sizeof(int) * RAJA::stripIndexType(N));
+
   RAJA::ReduceSum<REDUCE_POL, int> worksum(0);
 
   // Calculate Working data
   call_kernel<EXEC_POLICY, USE_RESOURCE>(
-    RAJA::make_tuple(RAJA::RangeSegment(0, N)),
-    RAJA::make_tuple<int>(0),
+      RAJA::make_tuple(RAJA::RangeSegment(0, N)), RAJA::make_tuple<int>(0),
 
-    // Resource
-    work_res,
+      // Resource
+      work_res,
 
-    // lambda 0, only runs for sequential
-    [=] RAJA_HOST_DEVICE (RAJA::Index_type i, int & value) {
-       value = work_array[i];
-    },
+      // lambda 0, only runs for sequential
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i, int& value)
+      { value = work_array[i]; },
 
-    // lambda 1, only runs for device
-    [=] RAJA_HOST_DEVICE (RAJA::Index_type i, int & value) {
-       value += work_array[i];
-    },
+      // lambda 1, only runs for device
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i, int& value)
+      { value += work_array[i]; },
 
-    // lambda 2, (reduction) runs for both sequential and device
-    // Device: This only gets executed on the "root" thread which received the reduced value.
-    [=] RAJA_HOST_DEVICE (int & value) {
-       worksum += value;
-    }
+      // lambda 2, (reduction) runs for both sequential and device
+      // Device: This only gets executed on the "root" thread which received the
+      // reduced value.
+      [=] RAJA_HOST_DEVICE(int& value) { worksum += value; }
 
   );
 
-  ASSERT_EQ(worksum.get(), N*(N-1)/2);
+  ASSERT_EQ(worksum.get(), N * (N - 1) / 2);
 
-  deallocateForallTestData<int>(erased_work_res,
-                                work_array,
-                                check_array,
+  deallocateForallTestData<int>(erased_work_res, work_array, check_array,
                                 test_array);
 }
 
-// DEVICE_ and DEPTH_1_REDUCESUM execution policies use the above DEPTH_1_REDUCESUM test.
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEVICE_DEPTH_1_REDUCESUM&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(DEPTH_1_REDUCESUM(), args...);
+// DEVICE_ and DEPTH_1_REDUCESUM execution policies use the above
+// DEPTH_1_REDUCESUM test.
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEVICE_DEPTH_1_REDUCESUM&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(
+      DEPTH_1_REDUCESUM(), args...);
 }
 
 //
@@ -114,35 +123,41 @@ void KernelNestedLoopTest(const DEVICE_DEPTH_1_REDUCESUM&, Args... args){
 // Defining the Kernel Loop structure for Block Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
 struct BlockNestedLoopExec;
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct BlockNestedLoopExec<DEPTH_1_REDUCESUM, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type, RAJA::statement::Lambda<0>,
-        RAJA::statement::Reduce<typename camp::at<POLICY_DATA, camp::num<1>>::type, RAJA::operators::plus, RAJA::statement::Param<0>,
-          RAJA::statement::Lambda<2, RAJA::Params<0>>
-        >
-      >
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct BlockNestedLoopExec<DEPTH_1_REDUCESUM, REDUCE_POL, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      0,
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::statement::Lambda<0>,
+      RAJA::statement::Reduce<
+          typename camp::at<POLICY_DATA, camp::num<1>>::type,
+          RAJA::operators::plus,
+          RAJA::statement::Param<0>,
+          RAJA::statement::Lambda<2, RAJA::Params<0>>>>>;
 };
 
 #if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP)
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct BlockNestedLoopExec<DEVICE_DEPTH_1_REDUCESUM, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-      RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type, RAJA::statement::Lambda<1>>,
-        RAJA::statement::Reduce<typename camp::at<POLICY_DATA, camp::num<1>>::type, RAJA::operators::plus, RAJA::statement::Param<0>,
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct BlockNestedLoopExec<DEVICE_DEPTH_1_REDUCESUM, REDUCE_POL, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<
+      RAJA::statement::For<0,
+                           typename camp::at<POLICY_DATA, camp::num<0>>::type,
+                           RAJA::statement::Lambda<1>>,
+      RAJA::statement::Reduce<
+          typename camp::at<POLICY_DATA, camp::num<1>>::type,
+          RAJA::operators::plus,
+          RAJA::statement::Param<0>,
           RAJA::statement::Lambda<2, RAJA::Params<0>>
-          // Device: Lambda 2 only gets executed on the "root" thread which received the reduced value.
-        >
-      > // end DEVICE_KERNEL
-    >;
+          // Device: Lambda 2 only gets executed on the "root" thread which
+          // received the reduced value.
+          >>  // end DEVICE_KERNEL
+                                  >;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP
diff --git a/test/functional/kernel/nested-loop-reducesum/tests/nested-loop-ReduceSum-impl.hpp b/test/functional/kernel/nested-loop-reducesum/tests/nested-loop-ReduceSum-impl.hpp
index 54934bef6d..ee001ba8d8 100644
--- a/test/functional/kernel/nested-loop-reducesum/tests/nested-loop-ReduceSum-impl.hpp
+++ b/test/functional/kernel/nested-loop-reducesum/tests/nested-loop-ReduceSum-impl.hpp
@@ -10,20 +10,26 @@
 
 #include <numeric>
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_resource<EXEC_POL>( segs, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args)
+{
+  RAJA::kernel_resource<EXEC_POL>(segs, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args) {
-  RAJA::kernel<EXEC_POL>( segs, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel<EXEC_POL>(segs, args...);
 }
 
 //
@@ -31,94 +37,131 @@ typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORK
 // Define list of nested loop types the ReduceSum test supports.
 //
 //
-using ReduceSumSupportedLoopTypeList = camp::list<
-  DEPTH_3_REDUCESUM,
-  DEPTH_3_REDUCESUM_SEQ_INNER,
-  DEPTH_3_REDUCESUM_SEQ_OUTER,
-  DEVICE_DEPTH_3_REDUCESUM,
-  DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER,
-  DEVICE_DEPTH_3_REDUCESUM_SEQ_OUTER>;
+using ReduceSumSupportedLoopTypeList =
+    camp::list<DEPTH_3_REDUCESUM,
+               DEPTH_3_REDUCESUM_SEQ_INNER,
+               DEPTH_3_REDUCESUM_SEQ_OUTER,
+               DEVICE_DEPTH_3_REDUCESUM,
+               DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER,
+               DEVICE_DEPTH_3_REDUCESUM_SEQ_OUTER>;
 
 //
 //
 // ReduceSum 3D Matrix index calculation per element.
 //
 //
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
 void KernelNestedLoopTest(const DEPTH_3_REDUCESUM&,
                           const RAJA::Index_type dim0,
                           const RAJA::Index_type dim1,
-                          const RAJA::Index_type dim2){
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+                          const RAJA::Index_type dim2)
+{
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type flatSize = dim0 * dim1 * dim2;
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  allocateForallTestData<RAJA::Index_type>(flatSize,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(
+      flatSize, erased_work_res, &work_array, &check_array, &test_array);
 
-  RAJA::TypedRangeSegment<RAJA::Index_type> rangeflat(0,flatSize);
+  RAJA::TypedRangeSegment<RAJA::Index_type> rangeflat(0, flatSize);
   RAJA::TypedRangeSegment<RAJA::Index_type> range0(0, dim0);
   RAJA::TypedRangeSegment<RAJA::Index_type> range1(0, dim1);
   RAJA::TypedRangeSegment<RAJA::Index_type> range2(0, dim2);
 
   std::iota(test_array, test_array + RAJA::stripIndexType(flatSize), 0);
 
-  erased_work_res.memcpy(work_array, test_array, sizeof(RAJA::Index_type) * RAJA::stripIndexType(flatSize));
+  erased_work_res.memcpy(work_array, test_array,
+                         sizeof(RAJA::Index_type) *
+                             RAJA::stripIndexType(flatSize));
 
   constexpr int Depth = 3;
-  RAJA::View< RAJA::Index_type, RAJA::Layout<Depth> > work_view(work_array, dim0, dim1, dim2);
+  RAJA::View<RAJA::Index_type, RAJA::Layout<Depth>> work_view(work_array, dim0,
+                                                              dim1, dim2);
 
   RAJA::ReduceSum<RAJA::seq_reduce, RAJA::Index_type> hostsum(0);
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> worksum(0);
 
-  call_kernel<EXEC_POLICY, USE_RESOURCE>(RAJA::make_tuple(range0, range1, range2), work_res,
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type i, RAJA::Index_type j, RAJA::Index_type k) {
-                              worksum += work_view(i,j,k);
-                            });
+  call_kernel<EXEC_POLICY, USE_RESOURCE>(
+      RAJA::make_tuple(range0, range1, range2), work_res,
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i, RAJA::Index_type j,
+                           RAJA::Index_type k)
+      { worksum += work_view(i, j, k); });
 
-  RAJA::forall<RAJA::seq_exec>(rangeflat, [=] (RAJA::Index_type i) {
-    hostsum += test_array[RAJA::stripIndexType(i)];
-  });
+  RAJA::forall<RAJA::seq_exec>(rangeflat,
+                               [=](RAJA::Index_type i) {
+                                 hostsum += test_array[RAJA::stripIndexType(i)];
+                               });
 
   ASSERT_EQ(hostsum.get(), worksum.get());
 
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
-// DEVICE_ and DEPTH_3_REDUCESUM_SEQ_ execution policies use the above DEPTH_3_REDUCESUM test.
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEPTH_3_REDUCESUM_SEQ_OUTER&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(DEPTH_3_REDUCESUM(), args...);
+// DEVICE_ and DEPTH_3_REDUCESUM_SEQ_ execution policies use the above
+// DEPTH_3_REDUCESUM test.
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEPTH_3_REDUCESUM_SEQ_OUTER&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(
+      DEPTH_3_REDUCESUM(), args...);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEPTH_3_REDUCESUM_SEQ_INNER&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(DEPTH_3_REDUCESUM(), args...);
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEPTH_3_REDUCESUM_SEQ_INNER&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(
+      DEPTH_3_REDUCESUM(), args...);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEVICE_DEPTH_3_REDUCESUM&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(DEPTH_3_REDUCESUM(), args...);
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEVICE_DEPTH_3_REDUCESUM&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(
+      DEPTH_3_REDUCESUM(), args...);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEVICE_DEPTH_3_REDUCESUM_SEQ_OUTER&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(DEPTH_3_REDUCESUM(), args...);
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEVICE_DEPTH_3_REDUCESUM_SEQ_OUTER&,
+                          Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(
+      DEPTH_3_REDUCESUM(), args...);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(DEPTH_3_REDUCESUM(), args...);
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER&,
+                          Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(
+      DEPTH_3_REDUCESUM(), args...);
 }
 
 //
@@ -126,99 +169,113 @@ void KernelNestedLoopTest(const DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER&, Args... arg
 // Defining the Kernel Loop structure for ReduceSum Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
 struct ReduceSumNestedLoopExec;
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct ReduceSumNestedLoopExec<DEPTH_3_REDUCESUM, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      >
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct ReduceSumNestedLoopExec<DEPTH_3_REDUCESUM, REDUCE_POL, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      0,
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::statement::For<
+          1,
+          typename camp::at<POLICY_DATA, camp::num<1>>::type,
+          RAJA::statement::For<
+              2,
+              typename camp::at<POLICY_DATA, camp::num<2>>::type,
+              RAJA::statement::Lambda<0>>>>>;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct ReduceSumNestedLoopExec<DEPTH_3_REDUCESUM_SEQ_OUTER, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<0, RAJA::seq_exec,
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      >
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct ReduceSumNestedLoopExec<DEPTH_3_REDUCESUM_SEQ_OUTER,
+                               REDUCE_POL,
+                               POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      0,
+      RAJA::seq_exec,
+      RAJA::statement::For<
+          1,
+          typename camp::at<POLICY_DATA, camp::num<1>>::type,
+          RAJA::statement::For<
+              2,
+              typename camp::at<POLICY_DATA, camp::num<2>>::type,
+              RAJA::statement::Lambda<0>>>>>;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct ReduceSumNestedLoopExec<DEPTH_3_REDUCESUM_SEQ_INNER, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::For<2, RAJA::seq_exec,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      >
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct ReduceSumNestedLoopExec<DEPTH_3_REDUCESUM_SEQ_INNER,
+                               REDUCE_POL,
+                               POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      0,
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::statement::For<1,
+                           typename camp::at<POLICY_DATA, camp::num<1>>::type,
+                           RAJA::statement::For<2,
+                                                RAJA::seq_exec,
+                                                RAJA::statement::Lambda<0>>>>>;
 };
 
 #if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP)
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct ReduceSumNestedLoopExec<DEVICE_DEPTH_3_REDUCESUM, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-              RAJA::statement::Lambda<0>
-            >
-          >
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct ReduceSumNestedLoopExec<DEVICE_DEPTH_3_REDUCESUM,
+                               REDUCE_POL,
+                               POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::For<
+                  2,
+                  typename camp::at<POLICY_DATA, camp::num<2>>::type,
+                  RAJA::statement::Lambda<0>>>>>  // end DEVICE_KERNEL
+                         >;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct ReduceSumNestedLoopExec<DEVICE_DEPTH_3_REDUCESUM_SEQ_OUTER, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, RAJA::seq_exec,
-          RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-              RAJA::statement::Lambda<0>
-            >
-          >
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct ReduceSumNestedLoopExec<DEVICE_DEPTH_3_REDUCESUM_SEQ_OUTER,
+                               REDUCE_POL,
+                               POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          0,
+          RAJA::seq_exec,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::For<
+                  2,
+                  typename camp::at<POLICY_DATA, camp::num<2>>::type,
+                  RAJA::statement::Lambda<0>>>>>  // end DEVICE_KERNEL
+                         >;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct ReduceSumNestedLoopExec<DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::For<2, RAJA::seq_exec,
-              RAJA::statement::Lambda<0>
-            >
-          >
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct ReduceSumNestedLoopExec<DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER,
+                               REDUCE_POL,
+                               POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::For<
+                  2,
+                  RAJA::seq_exec,
+                  RAJA::statement::Lambda<0>>>>>  // end DEVICE_KERNEL
+                         >;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP
diff --git a/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-nested-loop-BlockReduceSum.hpp b/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-nested-loop-BlockReduceSum.hpp
index cb2f444643..b2363c468b 100644
--- a/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-nested-loop-BlockReduceSum.hpp
+++ b/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-nested-loop-BlockReduceSum.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopBlockReduceSumTest);
 template <typename T>
-class KernelNestedLoopBlockReduceSumTest : public ::testing::Test {};
+class KernelNestedLoopBlockReduceSumTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopBlockReduceSumTest, NestedLoopBlockKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelNestedLoopBlockReduceSumTest, NestedLoopBlockKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,13 +33,16 @@ TYPED_TEST_P(KernelNestedLoopBlockReduceSumTest, NestedLoopBlockKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename BlockNestedLoopExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename BlockNestedLoopExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = false;
 
   // For double nested loop tests the third arg is ignored.
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(LOOP_TYPE(), 1023);
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(LOOP_TYPE(), 2345);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 1023);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 2345);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopBlockReduceSumTest,
diff --git a/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-nested-loop-ReduceSum.hpp b/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-nested-loop-ReduceSum.hpp
index 344ae26666..aea740d451 100644
--- a/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-nested-loop-ReduceSum.hpp
+++ b/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-nested-loop-ReduceSum.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopReduceSumTest);
 template <typename T>
-class KernelNestedLoopReduceSumTest : public ::testing::Test {};
+class KernelNestedLoopReduceSumTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopReduceSumTest, NestedLoopReduceSumKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelNestedLoopReduceSumTest, NestedLoopReduceSumKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,13 +33,16 @@ TYPED_TEST_P(KernelNestedLoopReduceSumTest, NestedLoopReduceSumKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename ReduceSumNestedLoopExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename ReduceSumNestedLoopExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = false;
 
   // For double nested loop tests the third arg is ignored.
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 1,1,1);
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 40,30,20);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 1, 1, 1);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 40, 30, 20);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopReduceSumTest,
diff --git a/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-resource-nested-loop-BlockReduceSum.hpp b/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-resource-nested-loop-BlockReduceSum.hpp
index a83c16592f..7ecdf9252f 100644
--- a/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-resource-nested-loop-BlockReduceSum.hpp
+++ b/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-resource-nested-loop-BlockReduceSum.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopBlockReduceSumTest);
 template <typename T>
-class KernelNestedLoopBlockReduceSumTest : public ::testing::Test {};
+class KernelNestedLoopBlockReduceSumTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopBlockReduceSumTest, NestedLoopBlockKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelNestedLoopBlockReduceSumTest, NestedLoopBlockKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,13 +33,16 @@ TYPED_TEST_P(KernelNestedLoopBlockReduceSumTest, NestedLoopBlockKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename BlockNestedLoopExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename BlockNestedLoopExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = true;
 
   // For double nested loop tests the third arg is ignored.
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(LOOP_TYPE(), 1023);
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(LOOP_TYPE(), 2345);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 1023);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 2345);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopBlockReduceSumTest,
diff --git a/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-resource-nested-loop-ReduceSum.hpp b/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-resource-nested-loop-ReduceSum.hpp
index bbf888f680..b96edb880a 100644
--- a/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-resource-nested-loop-ReduceSum.hpp
+++ b/test/functional/kernel/nested-loop-reducesum/tests/test-kernel-resource-nested-loop-ReduceSum.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopReduceSumTest);
 template <typename T>
-class KernelNestedLoopReduceSumTest : public ::testing::Test {};
+class KernelNestedLoopReduceSumTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopReduceSumTest, NestedLoopReduceSumKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelNestedLoopReduceSumTest, NestedLoopReduceSumKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,13 +33,16 @@ TYPED_TEST_P(KernelNestedLoopReduceSumTest, NestedLoopReduceSumKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename ReduceSumNestedLoopExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename ReduceSumNestedLoopExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = true;
 
   // For double nested loop tests the third arg is ignored.
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 1,1,1);
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 40,30,20);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 1, 1, 1);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 40, 30, 20);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopReduceSumTest,
diff --git a/test/functional/kernel/nested-loop-segment-types/tests/test-kernel-nested-loops-segment-types.hpp b/test/functional/kernel/nested-loop-segment-types/tests/test-kernel-nested-loops-segment-types.hpp
index 1c1eafabc5..a34688ef8d 100644
--- a/test/functional/kernel/nested-loop-segment-types/tests/test-kernel-nested-loops-segment-types.hpp
+++ b/test/functional/kernel/nested-loop-segment-types/tests/test-kernel-nested-loops-segment-types.hpp
@@ -19,21 +19,22 @@
 
 template <typename IDX_TYPE, typename DATA_TYPE, typename EXEC_POLICY>
 void KernelNestedLoopsSegmentTypesTestImpl(
-  const RAJA::TypedRangeSegment<IDX_TYPE>& s1, 
-  const std::vector<IDX_TYPE>& s1_idx,
-  const RAJA::TypedRangeStrideSegment<IDX_TYPE>& s2,
-  const std::vector<IDX_TYPE>& s2_idx,
-  const RAJA::TypedListSegment<IDX_TYPE>& s3,
-  const std::vector<IDX_TYPE>& s3_idx,
-  camp::resources::Resource working_res,
-  int perm)
+    const RAJA::TypedRangeSegment<IDX_TYPE>& s1,
+    const std::vector<IDX_TYPE>& s1_idx,
+    const RAJA::TypedRangeStrideSegment<IDX_TYPE>& s2,
+    const std::vector<IDX_TYPE>& s2_idx,
+    const RAJA::TypedListSegment<IDX_TYPE>& s3,
+    const std::vector<IDX_TYPE>& s3_idx,
+    camp::resources::Resource working_res,
+    int perm)
 {
   IDX_TYPE idx1_len = static_cast<IDX_TYPE>(s1_idx.size());
   IDX_TYPE idx2_len = static_cast<IDX_TYPE>(s2_idx.size());
   IDX_TYPE idx3_len = static_cast<IDX_TYPE>(s3_idx.size());
 
   bool zero_legth_segment = false;
-  if ( RAJA::stripIndexType(idx1_len * idx2_len * idx3_len) == 0 ) {
+  if (RAJA::stripIndexType(idx1_len * idx2_len * idx3_len) == 0)
+  {
     zero_legth_segment = true;
   }
 
@@ -41,7 +42,8 @@ void KernelNestedLoopsSegmentTypesTestImpl(
   IDX_TYPE dim2 = 1;
   IDX_TYPE dim3 = 1;
 
-  if ( !zero_legth_segment ) {
+  if (!zero_legth_segment)
+  {
     dim1 = s1_idx[s1_idx.size() - 1] + 1;
     dim2 = s2_idx[s2_idx.size() - 1] + 1;
     dim3 = s3_idx[s3_idx.size() - 1] + 1;
@@ -53,76 +55,82 @@ void KernelNestedLoopsSegmentTypesTestImpl(
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &work_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &work_array,
+                                    &check_array, &test_array);
 
-  RAJA::View< DATA_TYPE, RAJA::Layout<3> > work_view(work_array, 
-                                                     dim1, dim2, dim3);
-  RAJA::View< DATA_TYPE, RAJA::Layout<3> > test_view(test_array, 
-                                                     dim1, dim2, dim3);
+  RAJA::View<DATA_TYPE, RAJA::Layout<3>> work_view(work_array, dim1, dim2,
+                                                   dim3);
+  RAJA::View<DATA_TYPE, RAJA::Layout<3>> test_view(test_array, dim1, dim2,
+                                                   dim3);
 
-  memset( static_cast<void*>(test_array), 0, 
-          sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len) );
+  memset(static_cast<void*>(test_array), 0,
+         sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
-  working_res.memcpy(work_array, test_array, 
+  working_res.memcpy(work_array, test_array,
                      sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
-  if ( !zero_legth_segment ) {
-    for (IDX_TYPE i1 = 0; i1 < idx1_len; ++i1) {
-      for (IDX_TYPE i2 = 0; i2 < idx2_len; ++i2) {
-        for (IDX_TYPE i3 = 0; i3 < idx3_len; ++i3) {
+  if (!zero_legth_segment)
+  {
+    for (IDX_TYPE i1 = 0; i1 < idx1_len; ++i1)
+    {
+      for (IDX_TYPE i2 = 0; i2 < idx2_len; ++i2)
+      {
+        for (IDX_TYPE i3 = 0; i3 < idx3_len; ++i3)
+        {
           auto ii1 = RAJA::stripIndexType(i1);
           auto ii2 = RAJA::stripIndexType(i2);
           auto ii3 = RAJA::stripIndexType(i3);
-          test_view( s1_idx[ii1], s2_idx[ii2], s3_idx[ii3] ) = 
-            static_cast<DATA_TYPE>( RAJA::stripIndexType(
-                                    s1_idx[ii1] + s2_idx[ii2] + s3_idx[ii3]) );
+          test_view(s1_idx[ii1], s2_idx[ii2], s3_idx[ii3]) =
+              static_cast<DATA_TYPE>(RAJA::stripIndexType(
+                  s1_idx[ii1] + s2_idx[ii2] + s3_idx[ii3]));
         }
       }
     }
   }
 
-  if ( perm == 1 ) {
-    RAJA::kernel<EXEC_POLICY>( RAJA::make_tuple( s1, s2, s3 ),
-      [=] RAJA_HOST_DEVICE (IDX_TYPE i1, IDX_TYPE i2, IDX_TYPE i3) {
-        work_view(i1, i2, i3) = 
-          static_cast<DATA_TYPE>( RAJA::stripIndexType(i1 + i2 + i3) );
-      }
-    );
+  if (perm == 1)
+  {
+    RAJA::kernel<EXEC_POLICY>(
+        RAJA::make_tuple(s1, s2, s3),
+        [=] RAJA_HOST_DEVICE(IDX_TYPE i1, IDX_TYPE i2, IDX_TYPE i3)
+        {
+          work_view(i1, i2, i3) =
+              static_cast<DATA_TYPE>(RAJA::stripIndexType(i1 + i2 + i3));
+        });
+  }
+
+  if (perm == 2)
+  {
+    RAJA::kernel<EXEC_POLICY>(
+        RAJA::make_tuple(s2, s3, s1),
+        [=] RAJA_HOST_DEVICE(IDX_TYPE i2, IDX_TYPE i3, IDX_TYPE i1)
+        {
+          work_view(i1, i2, i3) =
+              static_cast<DATA_TYPE>(RAJA::stripIndexType(i1 + i2 + i3));
+        });
   }
- 
-  if ( perm == 2 ) {
-    RAJA::kernel<EXEC_POLICY>( RAJA::make_tuple( s2, s3, s1 ),
-      [=] RAJA_HOST_DEVICE (IDX_TYPE i2, IDX_TYPE i3, IDX_TYPE i1) {
-        work_view(i1, i2, i3) = 
-          static_cast<DATA_TYPE>( RAJA::stripIndexType(i1 + i2 + i3) );
-      }
-    );
-  } 
-
-  if ( perm == 3 ) {
-    RAJA::kernel<EXEC_POLICY>( RAJA::make_tuple( s3, s1, s2 ),
-      [=] RAJA_HOST_DEVICE (IDX_TYPE i3, IDX_TYPE i1, IDX_TYPE i2) {
-        work_view(i1, i2, i3) = 
-          static_cast<DATA_TYPE>( RAJA::stripIndexType(i1 + i2 + i3) );
-      }
-    );
-  } 
 
-  working_res.memcpy(check_array, work_array, 
+  if (perm == 3)
+  {
+    RAJA::kernel<EXEC_POLICY>(
+        RAJA::make_tuple(s3, s1, s2),
+        [=] RAJA_HOST_DEVICE(IDX_TYPE i3, IDX_TYPE i1, IDX_TYPE i2)
+        {
+          work_view(i1, i2, i3) =
+              static_cast<DATA_TYPE>(RAJA::stripIndexType(i1 + i2 + i3));
+        });
+  }
+
+  working_res.memcpy(check_array, work_array,
                      sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
     auto ii = RAJA::stripIndexType(i);
-    ASSERT_EQ( test_array[ii], check_array[ii] );
+    ASSERT_EQ(test_array[ii], check_array[ii]);
   }
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      work_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, work_array, check_array,
                                       test_array);
 }
 
@@ -130,8 +138,7 @@ void KernelNestedLoopsSegmentTypesTestImpl(
 TYPED_TEST_SUITE_P(KernelNestedLoopsSegmentTypesTest);
 template <typename T>
 class KernelNestedLoopsSegmentTypesTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelNestedLoopsSegmentTypesTest, NestedLoopsSegmentTypesKernel)
 {
@@ -139,144 +146,96 @@ TYPED_TEST_P(KernelNestedLoopsSegmentTypesTest, NestedLoopsSegmentTypesKernel)
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> s1_idx;
   std::vector<IDX_TYPE> s2_idx;
   std::vector<IDX_TYPE> s3_idx;
 
-// Create a segment of each basic type RAJA provides and test
-// permutations of those segments in nested loops 
+  // Create a segment of each basic type RAJA provides and test
+  // permutations of those segments in nested loops
 
-  RAJA::TypedRangeSegment<IDX_TYPE> s1( 0, 69 );
+  RAJA::TypedRangeSegment<IDX_TYPE> s1(0, 69);
   RAJA::getIndices(s1_idx, s1);
 
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> s2( 3, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> s2(3, 188, 2);
   RAJA::getIndices(s2_idx, s2);
 
   IDX_TYPE last = IDX_TYPE(427);
-  srand( time(NULL) );
-  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = IDX_TYPE(0); i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       s3_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> s3( &s3_idx[0], s3_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> s3(&s3_idx[0], s3_idx.size(), working_res);
 
   int perm = 1;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,    
-                                        s2, s2_idx,    
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s2, s2_idx, s3, s3_idx, working_res, perm);
 
   perm = 2;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,
-                                        s2, s2_idx,
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s2, s2_idx, s3, s3_idx, working_res, perm);
 
   perm = 3;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,
-                                        s2, s2_idx,
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s2, s2_idx, s3, s3_idx, working_res, perm);
 
-// Test some zero-length segment combinations
+  // Test some zero-length segment combinations
 
-// Zero-length range segment
-  RAJA::TypedRangeSegment<IDX_TYPE> s4( 4, 4 );
+  // Zero-length range segment
+  RAJA::TypedRangeSegment<IDX_TYPE> s4(4, 4);
   std::vector<IDX_TYPE> s4_idx;
   RAJA::getIndices(s4_idx, s4);
 
   perm = 1;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s4, s4_idx,
-                                        s2, s2_idx,
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
+      s4, s4_idx, s2, s2_idx, s3, s3_idx, working_res, perm);
 
   perm = 2;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s4, s4_idx,
-                                        s2, s2_idx,
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
+      s4, s4_idx, s2, s2_idx, s3, s3_idx, working_res, perm);
 
   perm = 3;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s4, s4_idx,
-                                        s2, s2_idx,
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
-
-// Zero-length range stride segment
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> s5( 3, 3, 2 );
+      s4, s4_idx, s2, s2_idx, s3, s3_idx, working_res, perm);
+
+  // Zero-length range stride segment
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> s5(3, 3, 2);
   std::vector<IDX_TYPE> s5_idx;
   RAJA::getIndices(s5_idx, s5);
 
   perm = 1;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,
-                                        s5, s5_idx,
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s5, s5_idx, s3, s3_idx, working_res, perm);
 
   perm = 2;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,
-                                        s5, s5_idx,
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s5, s5_idx, s3, s3_idx, working_res, perm);
 
   perm = 3;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,
-                                        s5, s5_idx,
-                                        s3, s3_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s5, s5_idx, s3, s3_idx, working_res, perm);
 
-// Zero-length list segment 
+  // Zero-length list segment
   std::vector<IDX_TYPE> s6_idx;
-  RAJA::TypedListSegment<IDX_TYPE> s6( nullptr, s6_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> s6(nullptr, s6_idx.size(), working_res);
 
   perm = 1;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,
-                                        s2, s2_idx,
-                                        s6, s6_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s2, s2_idx, s6, s6_idx, working_res, perm);
 
   perm = 2;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,
-                                        s2, s2_idx,
-                                        s6, s6_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s2, s2_idx, s6, s6_idx, working_res, perm);
 
   perm = 3;
   KernelNestedLoopsSegmentTypesTestImpl<IDX_TYPE, int, EXEC_POLICY>(
-                                        s1, s1_idx,
-                                        s2, s2_idx,
-                                        s6, s6_idx,
-                                        working_res,
-                                        perm);
+      s1, s1_idx, s2, s2_idx, s6, s6_idx, working_res, perm);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopsSegmentTypesTest,
diff --git a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-OffsetView2D.hpp b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-OffsetView2D.hpp
index 77f168ce2f..76089d813d 100644
--- a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-OffsetView2D.hpp
+++ b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-OffsetView2D.hpp
@@ -14,7 +14,7 @@ void KernelOffsetView2DTestImpl(std::array<RAJA::idx_t, 2> dim,
                                 std::array<RAJA::idx_t, 2> offset_lo,
                                 std::array<RAJA::idx_t, 2> offset_hi)
 {
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   IDX_TYPE* working_array;
   IDX_TYPE* check_array;
   IDX_TYPE* test_array;
@@ -23,51 +23,45 @@ void KernelOffsetView2DTestImpl(std::array<RAJA::idx_t, 2> dim,
 
   RAJA::idx_t off_dim0 = offset_hi.at(0) - offset_lo.at(0);
   RAJA::idx_t off_dim1 = offset_hi.at(1) - offset_lo.at(1);
-  EXPECT_LT( off_dim0, dim.at(0) );
-  EXPECT_LT( off_dim1, dim.at(1) );
+  EXPECT_LT(off_dim0, dim.at(0));
+  EXPECT_LT(off_dim1, dim.at(1));
 
-  allocateForallTestData<IDX_TYPE>(N,
-                                   working_res,
-                                   &working_array,
-                                   &check_array,
+  allocateForallTestData<IDX_TYPE>(N, working_res, &working_array, &check_array,
                                    &test_array);
 
   memset(static_cast<void*>(test_array), 0, sizeof(IDX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(IDX_TYPE) * N);
 
-  for (RAJA::idx_t i = 0; i < off_dim0; ++i) {
-    for (RAJA::idx_t j = 0; j < off_dim1; ++j) {
+  for (RAJA::idx_t i = 0; i < off_dim0; ++i)
+  {
+    for (RAJA::idx_t j = 0; j < off_dim1; ++j)
+    {
       test_array[j + dim.at(1) * i] = static_cast<IDX_TYPE>(1);
     }
   }
 
 
-  RAJA::OffsetLayout<2> layout =
-    RAJA::make_offset_layout<2>( {{offset_lo.at(0), offset_lo.at(1)}},
-                                 {{offset_lo.at(0) + dim.at(0),
-                                   offset_lo.at(1) + dim.at(1)}} );
-  RAJA::View< IDX_TYPE, RAJA::OffsetLayout<2> > view(working_array, layout);
+  RAJA::OffsetLayout<2> layout = RAJA::make_offset_layout<2>(
+      {{offset_lo.at(0), offset_lo.at(1)}},
+      {{offset_lo.at(0) + dim.at(0), offset_lo.at(1) + dim.at(1)}});
+  RAJA::View<IDX_TYPE, RAJA::OffsetLayout<2>> view(working_array, layout);
 
-  RAJA::TypedRangeSegment<IDX_TYPE> iseg( offset_lo.at(0), offset_hi.at(0));
-  RAJA::TypedRangeSegment<IDX_TYPE> jseg( offset_lo.at(1), offset_hi.at(1));
+  RAJA::TypedRangeSegment<IDX_TYPE> iseg(offset_lo.at(0), offset_hi.at(0));
+  RAJA::TypedRangeSegment<IDX_TYPE> jseg(offset_lo.at(1), offset_hi.at(1));
 
-  RAJA::kernel<EXEC_POLICY>(
-    RAJA::make_tuple( iseg, jseg ),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j) {
-      view(i, j) = static_cast<IDX_TYPE>(1);
-    }
-  );
+  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(iseg, jseg),
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j)
+                            { view(i, j) = static_cast<IDX_TYPE>(1); });
 
   working_res.memcpy(check_array, working_array, sizeof(IDX_TYPE) * N);
 
-  for (RAJA::idx_t ii = 0; ii < N; ++ii) {
+  for (RAJA::idx_t ii = 0; ii < N; ++ii)
+  {
     ASSERT_EQ(test_array[ii], check_array[ii]);
   }
 
-  deallocateForallTestData<IDX_TYPE>(working_res,
-                                     working_array,
-                                     check_array,
+  deallocateForallTestData<IDX_TYPE>(working_res, working_array, check_array,
                                      test_array);
 }
 
@@ -75,8 +69,7 @@ void KernelOffsetView2DTestImpl(std::array<RAJA::idx_t, 2> dim,
 TYPED_TEST_SUITE_P(KernelNestedLoopOffsetView2DTest);
 template <typename T>
 class KernelNestedLoopOffsetView2DTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(KernelNestedLoopOffsetView2DTest, OffsetView2DKernelTest)
@@ -94,30 +87,26 @@ TYPED_TEST_P(KernelNestedLoopOffsetView2DTest, OffsetView2DKernelTest)
   // Square views
   //
   std::array<RAJA::idx_t, 2> offset_lo {{0, 2}};
-  std::array<RAJA::idx_t, 2> offset_hi {{dim0-3, dim1-4}};
-  KernelOffsetView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim,
-                                                                 offset_lo,
+  std::array<RAJA::idx_t, 2> offset_hi {{dim0 - 3, dim1 - 4}};
+  KernelOffsetView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim, offset_lo,
                                                                  offset_hi);
 
   offset_lo = std::array<RAJA::idx_t, 2> {{-1, -2}};
-  offset_hi = std::array<RAJA::idx_t, 2> {{dim0-3, dim1-6}};
-  KernelOffsetView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim,
-                                                                 offset_lo,
+  offset_hi = std::array<RAJA::idx_t, 2> {{dim0 - 3, dim1 - 6}};
+  KernelOffsetView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim, offset_lo,
                                                                  offset_hi);
 
   //
   // Non-square views
   //
   offset_lo = std::array<RAJA::idx_t, 2> {{0, 1}};
-  offset_hi = std::array<RAJA::idx_t, 2> {{dim0-3, dim1-1}};
-  KernelOffsetView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim,
-                                                                 offset_lo,
+  offset_hi = std::array<RAJA::idx_t, 2> {{dim0 - 3, dim1 - 1}};
+  KernelOffsetView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim, offset_lo,
                                                                  offset_hi);
 
   offset_lo = std::array<RAJA::idx_t, 2> {{-1, -1}};
-  offset_hi = std::array<RAJA::idx_t, 2> {{dim0-3, dim1-4}};
-  KernelOffsetView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim,
-                                                                 offset_lo,
+  offset_hi = std::array<RAJA::idx_t, 2> {{dim0 - 3, dim1 - 4}};
+  KernelOffsetView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim, offset_lo,
                                                                  offset_hi);
 }
 
diff --git a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-OffsetView3D.hpp b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-OffsetView3D.hpp
index 32adc3ede0..60c335154a 100644
--- a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-OffsetView3D.hpp
+++ b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-OffsetView3D.hpp
@@ -14,7 +14,7 @@ void KernelOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
                                 std::array<RAJA::idx_t, 3> offset_lo,
                                 std::array<RAJA::idx_t, 3> offset_hi)
 {
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   IDX_TYPE* working_array;
   IDX_TYPE* check_array;
   IDX_TYPE* test_array;
@@ -24,60 +24,54 @@ void KernelOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
   RAJA::idx_t off_dim0 = offset_hi.at(0) - offset_lo.at(0);
   RAJA::idx_t off_dim1 = offset_hi.at(1) - offset_lo.at(1);
   RAJA::idx_t off_dim2 = offset_hi.at(2) - offset_lo.at(2);
-  EXPECT_LT( off_dim0, dim.at(0) );
-  EXPECT_LT( off_dim1, dim.at(1) );
-  EXPECT_LT( off_dim2, dim.at(2) );
-
-  allocateForallTestData<IDX_TYPE>(N,
-                                   working_res,
-                                   &working_array,
-                                   &check_array,
+  EXPECT_LT(off_dim0, dim.at(0));
+  EXPECT_LT(off_dim1, dim.at(1));
+  EXPECT_LT(off_dim2, dim.at(2));
+
+  allocateForallTestData<IDX_TYPE>(N, working_res, &working_array, &check_array,
                                    &test_array);
 
   memset(static_cast<void*>(test_array), 0, sizeof(IDX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(IDX_TYPE) * N);
 
-  for (RAJA::idx_t i = 0; i < off_dim0; ++i) {
-    for (RAJA::idx_t j = 0; j < off_dim1; ++j) {
-      for (RAJA::idx_t k = 0; k < off_dim2; ++k) {
+  for (RAJA::idx_t i = 0; i < off_dim0; ++i)
+  {
+    for (RAJA::idx_t j = 0; j < off_dim1; ++j)
+    {
+      for (RAJA::idx_t k = 0; k < off_dim2; ++k)
+      {
         test_array[k + dim.at(2) * j + dim.at(1) * dim.at(2) * i] =
-          static_cast<IDX_TYPE>(1);
+            static_cast<IDX_TYPE>(1);
       }
     }
   }
 
 
-  RAJA::OffsetLayout<3> layout =
-    RAJA::make_offset_layout<3>( {{offset_lo.at(0),
-                                   offset_lo.at(1),
-                                   offset_lo.at(2)}},
-                                 {{offset_lo.at(0) + dim.at(0),
-                                   offset_lo.at(1) + dim.at(1),
-                                   offset_lo.at(2) + dim.at(2)}} );
+  RAJA::OffsetLayout<3> layout = RAJA::make_offset_layout<3>(
+      {{offset_lo.at(0), offset_lo.at(1), offset_lo.at(2)}},
+      {{offset_lo.at(0) + dim.at(0), offset_lo.at(1) + dim.at(1),
+        offset_lo.at(2) + dim.at(2)}});
 
-  RAJA::View< IDX_TYPE, RAJA::OffsetLayout<3> > view(working_array, layout);
+  RAJA::View<IDX_TYPE, RAJA::OffsetLayout<3>> view(working_array, layout);
 
-  RAJA::TypedRangeSegment<IDX_TYPE> iseg( offset_lo.at(0), offset_hi.at(0));
-  RAJA::TypedRangeSegment<IDX_TYPE> jseg( offset_lo.at(1), offset_hi.at(1));
-  RAJA::TypedRangeSegment<IDX_TYPE> kseg( offset_lo.at(2), offset_hi.at(2));
+  RAJA::TypedRangeSegment<IDX_TYPE> iseg(offset_lo.at(0), offset_hi.at(0));
+  RAJA::TypedRangeSegment<IDX_TYPE> jseg(offset_lo.at(1), offset_hi.at(1));
+  RAJA::TypedRangeSegment<IDX_TYPE> kseg(offset_lo.at(2), offset_hi.at(2));
 
   RAJA::kernel<EXEC_POLICY>(
-    RAJA::make_tuple( iseg, jseg, kseg ),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j, IDX_TYPE k) {
-      view(i, j, k) = static_cast<IDX_TYPE>(1);
-    }
-  );
+      RAJA::make_tuple(iseg, jseg, kseg),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j, IDX_TYPE k)
+      { view(i, j, k) = static_cast<IDX_TYPE>(1); });
 
   working_res.memcpy(check_array, working_array, sizeof(IDX_TYPE) * N);
 
-  for (RAJA::idx_t ii = 0; ii < N; ++ii) {
+  for (RAJA::idx_t ii = 0; ii < N; ++ii)
+  {
     ASSERT_EQ(test_array[ii], check_array[ii]);
   }
 
-  deallocateForallTestData<IDX_TYPE>(working_res,
-                                     working_array,
-                                     check_array,
+  deallocateForallTestData<IDX_TYPE>(working_res, working_array, check_array,
                                      test_array);
 }
 
@@ -85,8 +79,7 @@ void KernelOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
 TYPED_TEST_SUITE_P(KernelNestedLoopOffsetView3DTest);
 template <typename T>
 class KernelNestedLoopOffsetView3DTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(KernelNestedLoopOffsetView3DTest, OffsetView3DKernelTest)
@@ -105,30 +98,26 @@ TYPED_TEST_P(KernelNestedLoopOffsetView3DTest, OffsetView3DKernelTest)
   // Square views
   //
   std::array<RAJA::idx_t, 3> offset_lo {{0, 2, 1}};
-  std::array<RAJA::idx_t, 3> offset_hi {{dim0-2, dim1-6, dim2-4}};
-  KernelOffsetView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim,
-                                                                 offset_lo,
+  std::array<RAJA::idx_t, 3> offset_hi {{dim0 - 2, dim1 - 6, dim2 - 4}};
+  KernelOffsetView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim, offset_lo,
                                                                  offset_hi);
 
   offset_lo = std::array<RAJA::idx_t, 3> {{-1, -2, -3}};
-  offset_hi = std::array<RAJA::idx_t, 3> {{dim0-3, dim1-10, dim2-8}};
-  KernelOffsetView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim,
-                                                                 offset_lo,
+  offset_hi = std::array<RAJA::idx_t, 3> {{dim0 - 3, dim1 - 10, dim2 - 8}};
+  KernelOffsetView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim, offset_lo,
                                                                  offset_hi);
 
   //
   // Non-square views
   //
   offset_lo = std::array<RAJA::idx_t, 3> {{0, 1, 2}};
-  offset_hi = std::array<RAJA::idx_t, 3> {{dim0-3, dim1-2, dim2-2}};
-  KernelOffsetView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim,
-                                                                 offset_lo,
+  offset_hi = std::array<RAJA::idx_t, 3> {{dim0 - 3, dim1 - 2, dim2 - 2}};
+  KernelOffsetView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim, offset_lo,
                                                                  offset_hi);
 
   offset_lo = std::array<RAJA::idx_t, 3> {{-1, -1, 0}};
-  offset_hi = std::array<RAJA::idx_t, 3> {{dim0-3, dim1-4, dim2-2}};
-  KernelOffsetView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim,
-                                                                 offset_lo,
+  offset_hi = std::array<RAJA::idx_t, 3> {{dim0 - 3, dim1 - 4, dim2 - 2}};
+  KernelOffsetView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim, offset_lo,
                                                                  offset_hi);
 }
 
diff --git a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedOffsetView2D.hpp b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedOffsetView2D.hpp
index f83126959d..1888c93016 100644
--- a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedOffsetView2D.hpp
+++ b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedOffsetView2D.hpp
@@ -12,7 +12,7 @@ template <typename IDX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void KernelPermutedOffsetView2DTestImpl(std::array<RAJA::idx_t, 2> dim,
                                         std::array<RAJA::idx_t, 2> perm)
 {
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   IDX_TYPE* A_work_array;
   IDX_TYPE* A_check_array;
   IDX_TYPE* A_test_array;
@@ -35,8 +35,8 @@ void KernelPermutedOffsetView2DTestImpl(std::array<RAJA::idx_t, 2> dim,
   //
   // Also, we assume a finite difference stencil width of one.
   //
-  RAJA::idx_t Nint_outer = dim.at( perm.at(0) );
-  RAJA::idx_t Nint_inner = dim.at( perm.at(1) );
+  RAJA::idx_t Nint_outer = dim.at(perm.at(0));
+  RAJA::idx_t Nint_inner = dim.at(perm.at(1));
 
   RAJA::idx_t Ntot_outer = Nint_outer + 2 * 1;
   RAJA::idx_t Ntot_inner = Nint_inner + 2 * 1;
@@ -45,16 +45,15 @@ void KernelPermutedOffsetView2DTestImpl(std::array<RAJA::idx_t, 2> dim,
   RAJA::idx_t Ntot = Ntot_outer * Ntot_inner;
 
 
-  allocateForallTestData<IDX_TYPE>(Ntot,
-                                   working_res,
-                                   &B_work_array,
-                                   &B_check_array,
-                                   &B_test_array);
+  allocateForallTestData<IDX_TYPE>(Ntot, working_res, &B_work_array,
+                                   &B_check_array, &B_test_array);
 
   memset(static_cast<void*>(B_test_array), 0, sizeof(IDX_TYPE) * Ntot);
 
-  for (RAJA::idx_t i = 1; i <= Nint_outer; ++i) {
-    for (RAJA::idx_t j = 1; j <= Nint_inner; ++j) {
+  for (RAJA::idx_t i = 1; i <= Nint_outer; ++i)
+  {
+    for (RAJA::idx_t j = 1; j <= Nint_inner; ++j)
+    {
       B_test_array[j + Ntot_inner * i] = static_cast<IDX_TYPE>(1);
     }
   }
@@ -63,70 +62,61 @@ void KernelPermutedOffsetView2DTestImpl(std::array<RAJA::idx_t, 2> dim,
   working_res.memcpy(B_work_array, B_test_array, sizeof(IDX_TYPE) * Ntot);
 
 
-  allocateForallTestData<IDX_TYPE>(Nint,
-                                   working_res,
-                                   &A_work_array,
-                                   &A_check_array,
-                                   &A_test_array);
+  allocateForallTestData<IDX_TYPE>(Nint, working_res, &A_work_array,
+                                   &A_check_array, &A_test_array);
 
   memset(static_cast<void*>(A_test_array), 0, sizeof(IDX_TYPE) * Nint);
 
   working_res.memcpy(A_work_array, A_test_array, sizeof(IDX_TYPE) * Nint);
 
-  for (RAJA::idx_t i = 0; i < Nint_outer; ++i) {
-    for (RAJA::idx_t j = 0; j < Nint_inner; ++j) {
+  for (RAJA::idx_t i = 0; i < Nint_outer; ++i)
+  {
+    for (RAJA::idx_t j = 0; j < Nint_inner; ++j)
+    {
 
       int A_idx = j + Nint_inner * i;
       int B_idx = (j + 1) + Ntot_inner * (i + 1);
 
-      A_test_array[A_idx] = B_test_array[B_idx] +                // C
-                            B_test_array[B_idx - Ntot_inner] +   // S
-                            B_test_array[B_idx + Ntot_inner] +   // N
-                            B_test_array[B_idx - 1] +            // W
-                            B_test_array[B_idx + 1];             // E
-
+      A_test_array[A_idx] = B_test_array[B_idx] +               // C
+                            B_test_array[B_idx - Ntot_inner] +  // S
+                            B_test_array[B_idx + Ntot_inner] +  // N
+                            B_test_array[B_idx - 1] +           // W
+                            B_test_array[B_idx + 1];            // E
     }
   }
 
 
-  RAJA::OffsetLayout<2> B_layout =
-    RAJA::make_permuted_offset_layout<2>( {{-1, -1}},
-                                          {{Ntot_len.at(0)-1, Ntot_len.at(1)-1}},
-                                          perm );
+  RAJA::OffsetLayout<2> B_layout = RAJA::make_permuted_offset_layout<2>(
+      {{-1, -1}}, {{Ntot_len.at(0) - 1, Ntot_len.at(1) - 1}}, perm);
   RAJA::Layout<2> A_layout =
-    RAJA::make_permuted_layout( {{Nint_len.at(0), Nint_len.at(1)}}, perm );
-
-  RAJA::View< IDX_TYPE, RAJA::OffsetLayout<2> > B_view(B_work_array, B_layout);
-  RAJA::View< IDX_TYPE, RAJA::Layout<2> >  A_view(A_work_array, A_layout);
+      RAJA::make_permuted_layout({{Nint_len.at(0), Nint_len.at(1)}}, perm);
 
-  RAJA::TypedRangeSegment<IDX_TYPE> iseg( 0, Nint_len.at(0) );
-  RAJA::TypedRangeSegment<IDX_TYPE> jseg( 0, Nint_len.at(1) );
+  RAJA::View<IDX_TYPE, RAJA::OffsetLayout<2>> B_view(B_work_array, B_layout);
+  RAJA::View<IDX_TYPE, RAJA::Layout<2>> A_view(A_work_array, A_layout);
 
-  RAJA::kernel<EXEC_POLICY>(
-    RAJA::make_tuple( iseg, jseg ),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j) {
+  RAJA::TypedRangeSegment<IDX_TYPE> iseg(0, Nint_len.at(0));
+  RAJA::TypedRangeSegment<IDX_TYPE> jseg(0, Nint_len.at(1));
 
-      A_view(i, j) = B_view(i, j) +
-                     B_view(i - 1, j) + B_view(i + 1, j) +
-                     B_view(i, j - 1) + B_view(i, j + 1);
-
-    }
-  );
+  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(iseg, jseg),
+                            [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j)
+                            {
+                              A_view(i, j) = B_view(i, j) + B_view(i - 1, j) +
+                                             B_view(i + 1, j) +
+                                             B_view(i, j - 1) +
+                                             B_view(i, j + 1);
+                            });
 
   working_res.memcpy(A_check_array, A_work_array, sizeof(IDX_TYPE) * Nint);
 
-  for (RAJA::idx_t ii = 0; ii < Nint; ++ii) {
+  for (RAJA::idx_t ii = 0; ii < Nint; ++ii)
+  {
     ASSERT_EQ(A_test_array[ii], A_check_array[ii]);
   }
 
-  deallocateForallTestData<IDX_TYPE>(working_res,
-                                     A_work_array,
-                                     A_check_array,
+  deallocateForallTestData<IDX_TYPE>(working_res, A_work_array, A_check_array,
                                      A_test_array);
 
-  deallocateForallTestData<IDX_TYPE>(working_res,
-                                     B_work_array,
-                                     B_check_array,
+  deallocateForallTestData<IDX_TYPE>(working_res, B_work_array, B_check_array,
                                      B_test_array);
 }
 
@@ -134,11 +124,11 @@ void KernelPermutedOffsetView2DTestImpl(std::array<RAJA::idx_t, 2> dim,
 TYPED_TEST_SUITE_P(KernelNestedLoopPermutedOffsetView2DTest);
 template <typename T>
 class KernelNestedLoopPermutedOffsetView2DTest : public ::testing::Test
-{
-};
+{};
 
 
-TYPED_TEST_P(KernelNestedLoopPermutedOffsetView2DTest, PermutedOffsetView2DKernelTest)
+TYPED_TEST_P(KernelNestedLoopPermutedOffsetView2DTest,
+             PermutedOffsetView2DKernelTest)
 {
   using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
diff --git a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedOffsetView3D.hpp b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedOffsetView3D.hpp
index 776aff7c57..0448cf268a 100644
--- a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedOffsetView3D.hpp
+++ b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedOffsetView3D.hpp
@@ -12,7 +12,7 @@ template <typename IDX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void KernelPermutedOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
                                         std::array<RAJA::idx_t, 3> perm)
 {
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   IDX_TYPE* A_work_array;
   IDX_TYPE* A_check_array;
   IDX_TYPE* A_test_array;
@@ -26,9 +26,8 @@ void KernelPermutedOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
   // Note that we assume a finite difference stencil width of one.
   //
   std::array<RAJA::idx_t, 3> Nint_len {{dim.at(0), dim.at(1), dim.at(2)}};
-  std::array<RAJA::idx_t, 3> Ntot_len {{dim.at(0) + 2 * 1,
-                                        dim.at(1) + 2 * 1,
-                                        dim.at(2) + 2 * 1}};
+  std::array<RAJA::idx_t, 3> Ntot_len {
+      {dim.at(0) + 2 * 1, dim.at(1) + 2 * 1, dim.at(2) + 2 * 1}};
 
   //
   // These are used in data initialization and setting reference solution.
@@ -37,9 +36,9 @@ void KernelPermutedOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
   //
   // Also, we assume a finite difference stencil width of one.
   //
-  RAJA::idx_t Nint_outer  = dim.at( perm.at(0) );
-  RAJA::idx_t Nint_middle = dim.at( perm.at(1) );
-  RAJA::idx_t Nint_inner  = dim.at( perm.at(2) );
+  RAJA::idx_t Nint_outer  = dim.at(perm.at(0));
+  RAJA::idx_t Nint_middle = dim.at(perm.at(1));
+  RAJA::idx_t Nint_inner  = dim.at(perm.at(2));
 
   RAJA::idx_t Ntot_outer  = Nint_outer + 2 * 1;
   RAJA::idx_t Ntot_middle = Nint_middle + 2 * 1;
@@ -49,19 +48,19 @@ void KernelPermutedOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
   RAJA::idx_t Ntot = Ntot_outer * Ntot_middle * Ntot_inner;
 
 
-  allocateForallTestData<IDX_TYPE>(Ntot,
-                                   working_res,
-                                   &B_work_array,
-                                   &B_check_array,
-                                   &B_test_array);
+  allocateForallTestData<IDX_TYPE>(Ntot, working_res, &B_work_array,
+                                   &B_check_array, &B_test_array);
 
   memset(static_cast<void*>(B_test_array), 0, sizeof(IDX_TYPE) * Ntot);
 
-  for (RAJA::idx_t i = 1; i <= Nint_outer; ++i) {
-    for (RAJA::idx_t j = 1; j <= Nint_middle; ++j) {
-      for (RAJA::idx_t k = 1; k <= Nint_inner; ++k) {
+  for (RAJA::idx_t i = 1; i <= Nint_outer; ++i)
+  {
+    for (RAJA::idx_t j = 1; j <= Nint_middle; ++j)
+    {
+      for (RAJA::idx_t k = 1; k <= Nint_inner; ++k)
+      {
         B_test_array[k + j * Ntot_inner + i * Ntot_inner * Ntot_middle] =
-          static_cast<IDX_TYPE>(1);
+            static_cast<IDX_TYPE>(1);
       }
     }
   }
@@ -70,80 +69,71 @@ void KernelPermutedOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
   working_res.memcpy(B_work_array, B_test_array, sizeof(IDX_TYPE) * Ntot);
 
 
-  allocateForallTestData<IDX_TYPE>(Nint,
-                                   working_res,
-                                   &A_work_array,
-                                   &A_check_array,
-                                   &A_test_array);
+  allocateForallTestData<IDX_TYPE>(Nint, working_res, &A_work_array,
+                                   &A_check_array, &A_test_array);
 
   memset(static_cast<void*>(A_test_array), 0, sizeof(IDX_TYPE) * Nint);
 
   working_res.memcpy(A_work_array, A_test_array, sizeof(IDX_TYPE) * Nint);
 
-  for (RAJA::idx_t i = 0; i < Nint_outer; ++i) {
-    for (RAJA::idx_t j = 0; j < Nint_middle; ++j) {
-      for (RAJA::idx_t k = 0; k < Nint_inner; ++k) {
+  for (RAJA::idx_t i = 0; i < Nint_outer; ++i)
+  {
+    for (RAJA::idx_t j = 0; j < Nint_middle; ++j)
+    {
+      for (RAJA::idx_t k = 0; k < Nint_inner; ++k)
+      {
 
         int A_idx = k + j * Nint_inner + i * Nint_inner * Nint_middle;
         int B_idx =
-          (k + 1) + (j + 1) * Ntot_inner + (i + 1) * Ntot_inner * Ntot_middle;
+            (k + 1) + (j + 1) * Ntot_inner + (i + 1) * Ntot_inner * Ntot_middle;
 
         A_test_array[A_idx] =
-          B_test_array[B_idx] +                              // C
-          B_test_array[B_idx - 1] +                          // W
-          B_test_array[B_idx + 1] +                          // E
-          B_test_array[B_idx - Ntot_inner] +                 // S
-          B_test_array[B_idx + Ntot_inner] +                 // N
-          B_test_array[B_idx - (Ntot_inner*Ntot_middle)] +   // B
-          B_test_array[B_idx + (Ntot_inner*Ntot_middle)];    // T
-
+            B_test_array[B_idx] +                               // C
+            B_test_array[B_idx - 1] +                           // W
+            B_test_array[B_idx + 1] +                           // E
+            B_test_array[B_idx - Ntot_inner] +                  // S
+            B_test_array[B_idx + Ntot_inner] +                  // N
+            B_test_array[B_idx - (Ntot_inner * Ntot_middle)] +  // B
+            B_test_array[B_idx + (Ntot_inner * Ntot_middle)];   // T
       }
     }
   }
 
 
-  RAJA::OffsetLayout<3> B_layout =
-    RAJA::make_permuted_offset_layout<3>( {{-1, -1, -1}},
-                                          {{Ntot_len.at(0)-1,
-                                            Ntot_len.at(1)-1,
-                                            Ntot_len.at(2)-1}},
-                                          perm );
-  RAJA::Layout<3> A_layout =
-    RAJA::make_permuted_layout( {{Nint_len.at(0),
-                                  Nint_len.at(1),
-                                  Nint_len.at(2)}}, perm );
+  RAJA::OffsetLayout<3> B_layout = RAJA::make_permuted_offset_layout<3>(
+      {{-1, -1, -1}},
+      {{Ntot_len.at(0) - 1, Ntot_len.at(1) - 1, Ntot_len.at(2) - 1}}, perm);
+  RAJA::Layout<3> A_layout = RAJA::make_permuted_layout(
+      {{Nint_len.at(0), Nint_len.at(1), Nint_len.at(2)}}, perm);
 
-  RAJA::View< IDX_TYPE, RAJA::OffsetLayout<3> > B_view(B_work_array, B_layout);
-  RAJA::View< IDX_TYPE, RAJA::Layout<3> >  A_view(A_work_array, A_layout);
+  RAJA::View<IDX_TYPE, RAJA::OffsetLayout<3>> B_view(B_work_array, B_layout);
+  RAJA::View<IDX_TYPE, RAJA::Layout<3>> A_view(A_work_array, A_layout);
 
-  RAJA::TypedRangeSegment<IDX_TYPE> iseg( 0, Nint_len.at(0) );
-  RAJA::TypedRangeSegment<IDX_TYPE> jseg( 0, Nint_len.at(1) );
-  RAJA::TypedRangeSegment<IDX_TYPE> kseg( 0, Nint_len.at(2) );
+  RAJA::TypedRangeSegment<IDX_TYPE> iseg(0, Nint_len.at(0));
+  RAJA::TypedRangeSegment<IDX_TYPE> jseg(0, Nint_len.at(1));
+  RAJA::TypedRangeSegment<IDX_TYPE> kseg(0, Nint_len.at(2));
 
   RAJA::kernel<EXEC_POLICY>(
-    RAJA::make_tuple( iseg, jseg, kseg ),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j, IDX_TYPE k) {
-      A_view(i, j, k) = B_view(i, j, k) +
-                        B_view(i - 1, j, k) + B_view(i + 1, j, k) +
-                        B_view(i, j - 1, k) + B_view(i, j + 1, k) +
-                        B_view(i, j, k - 1) + B_view(i, j, k + 1);
-    }
-  );
+      RAJA::make_tuple(iseg, jseg, kseg),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j, IDX_TYPE k)
+      {
+        A_view(i, j, k) = B_view(i, j, k) + B_view(i - 1, j, k) +
+                          B_view(i + 1, j, k) + B_view(i, j - 1, k) +
+                          B_view(i, j + 1, k) + B_view(i, j, k - 1) +
+                          B_view(i, j, k + 1);
+      });
 
   working_res.memcpy(A_check_array, A_work_array, sizeof(IDX_TYPE) * Nint);
 
-  for (RAJA::idx_t ii = 0; ii < Nint; ++ii) {
+  for (RAJA::idx_t ii = 0; ii < Nint; ++ii)
+  {
     ASSERT_EQ(A_test_array[ii], A_check_array[ii]);
   }
 
-  deallocateForallTestData<IDX_TYPE>(working_res,
-                                     A_work_array,
-                                     A_check_array,
+  deallocateForallTestData<IDX_TYPE>(working_res, A_work_array, A_check_array,
                                      A_test_array);
 
-  deallocateForallTestData<IDX_TYPE>(working_res,
-                                     B_work_array,
-                                     B_check_array,
+  deallocateForallTestData<IDX_TYPE>(working_res, B_work_array, B_check_array,
                                      B_test_array);
 }
 
@@ -151,11 +141,11 @@ void KernelPermutedOffsetView3DTestImpl(std::array<RAJA::idx_t, 3> dim,
 TYPED_TEST_SUITE_P(KernelNestedLoopPermutedOffsetView3DTest);
 template <typename T>
 class KernelNestedLoopPermutedOffsetView3DTest : public ::testing::Test
-{
-};
+{};
 
 
-TYPED_TEST_P(KernelNestedLoopPermutedOffsetView3DTest, PermutedOffsetView3DKernelTest)
+TYPED_TEST_P(KernelNestedLoopPermutedOffsetView3DTest,
+             PermutedOffsetView3DKernelTest)
 {
   using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
diff --git a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedView2D.hpp b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedView2D.hpp
index 66311c43f1..37729d4e99 100644
--- a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedView2D.hpp
+++ b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedView2D.hpp
@@ -13,52 +13,49 @@ template <typename IDX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void KernelPermutedView2DTestImpl(std::array<IDX_TYPE, 2> dim,
                                   std::array<RAJA::idx_t, 2> perm)
 {
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   IDX_TYPE* working_array;
   IDX_TYPE* check_array;
   IDX_TYPE* test_array;
 
-  std::array<RAJA::idx_t, 2>
-    dim_strip {{ static_cast<RAJA::idx_t>( RAJA::stripIndexType(dim.at(0)) ),
-                 static_cast<RAJA::idx_t>( RAJA::stripIndexType(dim.at(1)) ) }};
+  std::array<RAJA::idx_t, 2> dim_strip {
+      {static_cast<RAJA::idx_t>(RAJA::stripIndexType(dim.at(0))),
+       static_cast<RAJA::idx_t>(RAJA::stripIndexType(dim.at(1)))}};
   RAJA::idx_t N = dim_strip.at(0) * dim_strip.at(1);
 
-  allocateForallTestData<IDX_TYPE>(N,
-                                   working_res,
-                                   &working_array,
-                                   &check_array,
+  allocateForallTestData<IDX_TYPE>(N, working_res, &working_array, &check_array,
                                    &test_array);
 
   memset(static_cast<void*>(test_array), 0, sizeof(IDX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(IDX_TYPE) * N);
 
-  int mod_val = dim.at( perm.at(1) );
-  for (RAJA::idx_t ii = 0; ii < N; ++ii) {
+  int mod_val = dim.at(perm.at(1));
+  for (RAJA::idx_t ii = 0; ii < N; ++ii)
+  {
     test_array[ii] = static_cast<IDX_TYPE>(ii % mod_val);
   }
 
   RAJA::Layout<2> layout = RAJA::make_permuted_layout(dim_strip, perm);
-  RAJA::View< IDX_TYPE, RAJA::Layout<2, int> > view(working_array, layout);
+  RAJA::View<IDX_TYPE, RAJA::Layout<2, int>> view(working_array, layout);
 
   RAJA::kernel<EXEC_POLICY>(
-    RAJA::make_tuple( RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(0)),
-                      RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(1)) ),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j) {
-      int val = RAJA::stripIndexType(layout(i, j)) % mod_val;
-      view(i, j) = static_cast<IDX_TYPE>(val);
-    }
-  );
+      RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(0)),
+                       RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(1))),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j)
+      {
+        int val    = RAJA::stripIndexType(layout(i, j)) % mod_val;
+        view(i, j) = static_cast<IDX_TYPE>(val);
+      });
 
   working_res.memcpy(check_array, working_array, sizeof(IDX_TYPE) * N);
 
-  for (RAJA::idx_t ii = 0; ii < N; ++ii) {
+  for (RAJA::idx_t ii = 0; ii < N; ++ii)
+  {
     ASSERT_EQ(test_array[ii], check_array[ii]);
   }
 
-  deallocateForallTestData<IDX_TYPE>(working_res,
-                                     working_array,
-                                     check_array,
+  deallocateForallTestData<IDX_TYPE>(working_res, working_array, check_array,
                                      test_array);
 }
 
@@ -66,8 +63,7 @@ void KernelPermutedView2DTestImpl(std::array<IDX_TYPE, 2> dim,
 TYPED_TEST_SUITE_P(KernelNestedLoopPermutedView2DTest);
 template <typename T>
 class KernelNestedLoopPermutedView2DTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(KernelNestedLoopPermutedView2DTest, PermutedView2DKernelTest)
@@ -81,8 +77,8 @@ TYPED_TEST_P(KernelNestedLoopPermutedView2DTest, PermutedView2DKernelTest)
   //
   // Square view
   //
-  std::array<IDX_TYPE, 2> dim_s  {{static_cast<IDX_TYPE>(21),
-                                   static_cast<IDX_TYPE>(21)}};
+  std::array<IDX_TYPE, 2> dim_s {
+      {static_cast<IDX_TYPE>(21), static_cast<IDX_TYPE>(21)}};
   KernelPermutedView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_s, perm);
 
   perm = std::array<RAJA::idx_t, 2> {{1, 0}};
@@ -92,12 +88,14 @@ TYPED_TEST_P(KernelNestedLoopPermutedView2DTest, PermutedView2DKernelTest)
   //
   // Non-square view
   //
-  std::array<IDX_TYPE, 2> dim_ns  {{static_cast<IDX_TYPE>(15),
-                                    static_cast<IDX_TYPE>(24)}};
-  KernelPermutedView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns, perm);
+  std::array<IDX_TYPE, 2> dim_ns {
+      {static_cast<IDX_TYPE>(15), static_cast<IDX_TYPE>(24)}};
+  KernelPermutedView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns,
+                                                                   perm);
 
   perm = std::array<RAJA::idx_t, 2> {{1, 0}};
-  KernelPermutedView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns, perm);
+  KernelPermutedView2DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns,
+                                                                   perm);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopPermutedView2DTest,
diff --git a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedView3D.hpp b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedView3D.hpp
index c3cb31ddce..7c3c329bf3 100644
--- a/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedView3D.hpp
+++ b/test/functional/kernel/nested-loop-view-types/tests/test-kernel-nested-loop-PermutedView3D.hpp
@@ -13,54 +13,51 @@ template <typename IDX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void KernelPermutedView3DTestImpl(std::array<IDX_TYPE, 3> dim,
                                   std::array<RAJA::idx_t, 3> perm)
 {
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   IDX_TYPE* working_array;
   IDX_TYPE* check_array;
   IDX_TYPE* test_array;
 
-  std::array<RAJA::idx_t, 3>
-    dim_strip {{ static_cast<RAJA::idx_t>( RAJA::stripIndexType(dim.at(0)) ),
-                 static_cast<RAJA::idx_t>( RAJA::stripIndexType(dim.at(1)) ),
-                 static_cast<RAJA::idx_t>( RAJA::stripIndexType(dim.at(2)) ) }};
+  std::array<RAJA::idx_t, 3> dim_strip {
+      {static_cast<RAJA::idx_t>(RAJA::stripIndexType(dim.at(0))),
+       static_cast<RAJA::idx_t>(RAJA::stripIndexType(dim.at(1))),
+       static_cast<RAJA::idx_t>(RAJA::stripIndexType(dim.at(2)))}};
   RAJA::idx_t N = dim_strip.at(0) * dim_strip.at(1) * dim_strip.at(2);
 
-  allocateForallTestData<IDX_TYPE>(N,
-                                   working_res,
-                                   &working_array,
-                                   &check_array,
+  allocateForallTestData<IDX_TYPE>(N, working_res, &working_array, &check_array,
                                    &test_array);
 
   memset(static_cast<void*>(test_array), 0, sizeof(IDX_TYPE) * N);
 
   working_res.memcpy(working_array, test_array, sizeof(IDX_TYPE) * N);
 
-  int mod_val = dim.at( perm.at(1) ) * dim.at( perm.at(2) );
-  for (RAJA::idx_t ii = 0; ii < N; ++ii) {
+  int mod_val = dim.at(perm.at(1)) * dim.at(perm.at(2));
+  for (RAJA::idx_t ii = 0; ii < N; ++ii)
+  {
     test_array[ii] = static_cast<IDX_TYPE>(ii % mod_val);
   }
 
   RAJA::Layout<3> layout = RAJA::make_permuted_layout(dim_strip, perm);
-  RAJA::View< IDX_TYPE, RAJA::Layout<3, int> > view(working_array, layout);
+  RAJA::View<IDX_TYPE, RAJA::Layout<3, int>> view(working_array, layout);
 
   RAJA::kernel<EXEC_POLICY>(
-    RAJA::make_tuple( RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(0)),
-                      RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(1)),
-                      RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(2)) ),
-    [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j, IDX_TYPE k) {
-      int val = RAJA::stripIndexType(layout(i, j, k)) % mod_val;
-      view(i, j, k) = static_cast<IDX_TYPE>(val);
-    }
-  );
+      RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(0)),
+                       RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(1)),
+                       RAJA::TypedRangeSegment<IDX_TYPE>(0, dim_strip.at(2))),
+      [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE j, IDX_TYPE k)
+      {
+        int val       = RAJA::stripIndexType(layout(i, j, k)) % mod_val;
+        view(i, j, k) = static_cast<IDX_TYPE>(val);
+      });
 
   working_res.memcpy(check_array, working_array, sizeof(IDX_TYPE) * N);
 
-  for (RAJA::idx_t ii = 0; ii < N; ++ii) {
+  for (RAJA::idx_t ii = 0; ii < N; ++ii)
+  {
     ASSERT_EQ(test_array[ii], check_array[ii]);
   }
 
-  deallocateForallTestData<IDX_TYPE>(working_res,
-                                     working_array,
-                                     check_array,
+  deallocateForallTestData<IDX_TYPE>(working_res, working_array, check_array,
                                      test_array);
 }
 
@@ -68,8 +65,7 @@ void KernelPermutedView3DTestImpl(std::array<IDX_TYPE, 3> dim,
 TYPED_TEST_SUITE_P(KernelNestedLoopPermutedView3DTest);
 template <typename T>
 class KernelNestedLoopPermutedView3DTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(KernelNestedLoopPermutedView3DTest, PermutedView3DKernelTest)
@@ -83,9 +79,9 @@ TYPED_TEST_P(KernelNestedLoopPermutedView3DTest, PermutedView3DKernelTest)
   //
   // Square view
   //
-  std::array<IDX_TYPE, 3> dim_s  {{static_cast<IDX_TYPE>(21),
-                                   static_cast<IDX_TYPE>(21),
-                                   static_cast<IDX_TYPE>(21)}};
+  std::array<IDX_TYPE, 3> dim_s {{static_cast<IDX_TYPE>(21),
+                                  static_cast<IDX_TYPE>(21),
+                                  static_cast<IDX_TYPE>(21)}};
   KernelPermutedView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_s, perm);
 
   perm = std::array<RAJA::idx_t, 3> {{1, 2, 0}};
@@ -98,16 +94,19 @@ TYPED_TEST_P(KernelNestedLoopPermutedView3DTest, PermutedView3DKernelTest)
   //
   // Non-square view
   //
-  std::array<IDX_TYPE, 3> dim_ns  {{static_cast<IDX_TYPE>(15),
-                                    static_cast<IDX_TYPE>(24),
-                                    static_cast<IDX_TYPE>(17)}};
-  KernelPermutedView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns, perm);
+  std::array<IDX_TYPE, 3> dim_ns {{static_cast<IDX_TYPE>(15),
+                                   static_cast<IDX_TYPE>(24),
+                                   static_cast<IDX_TYPE>(17)}};
+  KernelPermutedView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns,
+                                                                   perm);
 
   perm = std::array<RAJA::idx_t, 3> {{1, 2, 0}};
-  KernelPermutedView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns, perm);
+  KernelPermutedView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns,
+                                                                   perm);
 
   perm = std::array<RAJA::idx_t, 3> {{2, 0, 1}};
-  KernelPermutedView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns, perm);
+  KernelPermutedView3DTestImpl<IDX_TYPE, WORKING_RES, EXEC_POLICY>(dim_ns,
+                                                                   perm);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopPermutedView3DTest,
diff --git a/test/functional/kernel/nested-loop/tests/nested-loop-Basic-impl.hpp b/test/functional/kernel/nested-loop/tests/nested-loop-Basic-impl.hpp
index e5b99159b8..b0958b0f52 100644
--- a/test/functional/kernel/nested-loop/tests/nested-loop-Basic-impl.hpp
+++ b/test/functional/kernel/nested-loop/tests/nested-loop-Basic-impl.hpp
@@ -10,20 +10,26 @@
 
 #include <numeric>
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_resource<EXEC_POL>( segs, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args)
+{
+  RAJA::kernel_resource<EXEC_POL>(segs, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args) {
-  RAJA::kernel<EXEC_POL>( segs, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel<EXEC_POL>(segs, args...);
 }
 
 //
@@ -31,73 +37,88 @@ typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORK
 // Define list of nested loop types the Basic test supports.
 //
 //
-using BasicSupportedLoopTypeList = camp::list<
-  DEPTH_2,
-  DEPTH_2_COLLAPSE,
-  DEPTH_3,
-  DEPTH_3_COLLAPSE,
-  DEPTH_3_COLLAPSE_SEQ_INNER,
-  DEPTH_3_COLLAPSE_SEQ_OUTER,
-  DEVICE_DEPTH_2>;
+using BasicSupportedLoopTypeList = camp::list<DEPTH_2,
+                                              DEPTH_2_COLLAPSE,
+                                              DEPTH_3,
+                                              DEPTH_3_COLLAPSE,
+                                              DEPTH_3_COLLAPSE_SEQ_INNER,
+                                              DEPTH_3_COLLAPSE_SEQ_OUTER,
+                                              DEVICE_DEPTH_2>;
 
 //
 //
 // Basic 2D Matrix index calculation per element.
 //
 //
-template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE, typename... ExtraArgs>
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          bool USE_RESOURCE,
+          typename... ExtraArgs>
 void KernelNestedLoopTest(const DEPTH_2&,
                           const RAJA::Index_type dim0,
                           const RAJA::Index_type dim1,
-                          ExtraArgs...){
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+                          ExtraArgs...)
+{
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type flatSize = dim0 * dim1;
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  allocateForallTestData<RAJA::Index_type>(flatSize,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(
+      flatSize, erased_work_res, &work_array, &check_array, &test_array);
 
-  RAJA::TypedRangeSegment<RAJA::Index_type> rangeflat(0,flatSize);
+  RAJA::TypedRangeSegment<RAJA::Index_type> rangeflat(0, flatSize);
   RAJA::TypedRangeSegment<RAJA::Index_type> range0(0, dim0);
   RAJA::TypedRangeSegment<RAJA::Index_type> range1(0, dim1);
 
   std::iota(test_array, test_array + RAJA::stripIndexType(flatSize), 0);
 
   constexpr int Depth = 2;
-  RAJA::View< RAJA::Index_type, RAJA::Layout<Depth> > work_view(work_array, dim1, dim0);
-
-  call_kernel<EXEC_POLICY, USE_RESOURCE>(RAJA::make_tuple(range1, range0), work_res,
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type j, RAJA::Index_type i) {
-                              work_view(j,i) = (j * dim0) + i;
-                            });
-
-  work_res.memcpy(check_array, work_array, sizeof(RAJA::Index_type) * RAJA::stripIndexType(flatSize));
-  RAJA::forall<RAJA::seq_exec>(rangeflat, [=] (RAJA::Index_type i) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
-  });
-
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+  RAJA::View<RAJA::Index_type, RAJA::Layout<Depth>> work_view(work_array, dim1,
+                                                              dim0);
+
+  call_kernel<EXEC_POLICY, USE_RESOURCE>(
+      RAJA::make_tuple(range1, range0), work_res,
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type j, RAJA::Index_type i)
+      { work_view(j, i) = (j * dim0) + i; });
+
+  work_res.memcpy(check_array, work_array,
+                  sizeof(RAJA::Index_type) * RAJA::stripIndexType(flatSize));
+  RAJA::forall<RAJA::seq_exec>(rangeflat,
+                               [=](RAJA::Index_type i)
+                               {
+                                 ASSERT_EQ(
+                                     test_array[RAJA::stripIndexType(i)],
+                                     check_array[RAJA::stripIndexType(i)]);
+                               });
+
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
-// DEPTH_2_COLLAPSE and DEVICE_DEPTH_2 execution policies use the above DEPTH_2 test.
-template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEPTH_2_COLLAPSE&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_2(), args...);
+// DEPTH_2_COLLAPSE and DEVICE_DEPTH_2 execution policies use the above DEPTH_2
+// test.
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEPTH_2_COLLAPSE&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_2(),
+                                                               args...);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEVICE_DEPTH_2&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_2(), args...);
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEVICE_DEPTH_2&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_2(),
+                                                               args...);
 }
 
 //
@@ -109,22 +130,20 @@ template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE>
 void KernelNestedLoopTest(const DEPTH_3&,
                           const RAJA::Index_type dim0,
                           const RAJA::Index_type dim1,
-                          const RAJA::Index_type dim2){
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+                          const RAJA::Index_type dim2)
+{
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type flatSize = dim0 * dim1 * dim2;
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  allocateForallTestData<RAJA::Index_type>(flatSize,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(
+      flatSize, erased_work_res, &work_array, &check_array, &test_array);
 
-  RAJA::TypedRangeSegment<RAJA::Index_type> rangeflat(0,flatSize);
+  RAJA::TypedRangeSegment<RAJA::Index_type> rangeflat(0, flatSize);
   RAJA::TypedRangeSegment<RAJA::Index_type> range0(0, dim0);
   RAJA::TypedRangeSegment<RAJA::Index_type> range1(0, dim1);
   RAJA::TypedRangeSegment<RAJA::Index_type> range2(0, dim2);
@@ -132,38 +151,58 @@ void KernelNestedLoopTest(const DEPTH_3&,
   std::iota(test_array, test_array + RAJA::stripIndexType(flatSize), 0);
 
   constexpr int Depth = 3;
-  RAJA::View< RAJA::Index_type, RAJA::Layout<Depth> > work_view(work_array, dim2, dim1, dim0);
-
-  call_kernel<EXEC_POLICY, USE_RESOURCE>(RAJA::make_tuple(range2, range1, range0), work_res,
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type k, RAJA::Index_type j, RAJA::Index_type i) {
-                              work_view(k,j,i) = (dim0 * dim1 * k) + (dim0 * j) + i;
-                            });
-
-  work_res.memcpy(check_array, work_array, sizeof(RAJA::Index_type) * RAJA::stripIndexType(flatSize));
-  RAJA::forall<RAJA::seq_exec>(rangeflat, [=] (RAJA::Index_type i) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
-  });
-
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+  RAJA::View<RAJA::Index_type, RAJA::Layout<Depth>> work_view(work_array, dim2,
+                                                              dim1, dim0);
+
+  call_kernel<EXEC_POLICY, USE_RESOURCE>(
+      RAJA::make_tuple(range2, range1, range0), work_res,
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type k, RAJA::Index_type j,
+                           RAJA::Index_type i)
+      { work_view(k, j, i) = (dim0 * dim1 * k) + (dim0 * j) + i; });
+
+  work_res.memcpy(check_array, work_array,
+                  sizeof(RAJA::Index_type) * RAJA::stripIndexType(flatSize));
+  RAJA::forall<RAJA::seq_exec>(rangeflat,
+                               [=](RAJA::Index_type i)
+                               {
+                                 ASSERT_EQ(
+                                     test_array[RAJA::stripIndexType(i)],
+                                     check_array[RAJA::stripIndexType(i)]);
+                               });
+
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
 // DEPTH_3_COLLAPSE execution policies use the above DEPTH_3 test.
-template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEPTH_3_COLLAPSE&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_3(), args...);
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEPTH_3_COLLAPSE&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_3(),
+                                                               args...);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEPTH_3_COLLAPSE_SEQ_OUTER&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_3(), args...);
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEPTH_3_COLLAPSE_SEQ_OUTER&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_3(),
+                                                               args...);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE, typename... Args>
-void KernelNestedLoopTest(const DEPTH_3_COLLAPSE_SEQ_INNER&, Args... args){
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_3(), args...);
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelNestedLoopTest(const DEPTH_3_COLLAPSE_SEQ_INNER&, Args... args)
+{
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RESOURCE>(DEPTH_3(),
+                                                               args...);
 }
 
 //
@@ -171,97 +210,89 @@ void KernelNestedLoopTest(const DEPTH_3_COLLAPSE_SEQ_INNER&, Args... args){
 // Defining the Kernel Loop structure for Basic Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename POLICY_DATA>
 struct BasicNestedLoopExec;
 
-template<typename POLICY_DATA>
-struct BasicNestedLoopExec<DEPTH_3, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      >
-    >;
+template <typename POLICY_DATA>
+struct BasicNestedLoopExec<DEPTH_3, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      2,
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::statement::For<
+          1,
+          typename camp::at<POLICY_DATA, camp::num<1>>::type,
+          RAJA::statement::For<
+              0,
+              typename camp::at<POLICY_DATA, camp::num<2>>::type,
+              RAJA::statement::Lambda<0>>>>>;
 };
 
-template<typename POLICY_DATA>
-struct BasicNestedLoopExec<DEPTH_2, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::Lambda<0>
-        >
-      >
-    >;
+template <typename POLICY_DATA>
+struct BasicNestedLoopExec<DEPTH_2, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      1,
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::statement::For<0,
+                           typename camp::at<POLICY_DATA, camp::num<1>>::type,
+                           RAJA::statement::Lambda<0>>>>;
 };
 
-template<typename POLICY_DATA>
-struct BasicNestedLoopExec<DEPTH_2_COLLAPSE, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::Collapse< typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::ArgList<1,0>,
-        RAJA::statement::Lambda<0>
-      >
-    >;
+template <typename POLICY_DATA>
+struct BasicNestedLoopExec<DEPTH_2_COLLAPSE, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::Collapse<
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::ArgList<1, 0>,
+      RAJA::statement::Lambda<0>>>;
 };
 
-template<typename POLICY_DATA>
-struct BasicNestedLoopExec<DEPTH_3_COLLAPSE, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::Collapse< typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::ArgList<0,1,2>,
-        RAJA::statement::Lambda<0>
-      >
-    >;
+template <typename POLICY_DATA>
+struct BasicNestedLoopExec<DEPTH_3_COLLAPSE, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::Collapse<
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::ArgList<0, 1, 2>,
+      RAJA::statement::Lambda<0>>>;
 };
 
-template<typename POLICY_DATA>
-struct BasicNestedLoopExec<DEPTH_3_COLLAPSE_SEQ_OUTER, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<0, RAJA::seq_exec,
-      RAJA::statement::Collapse< typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::ArgList<1,2>,
-          RAJA::statement::Lambda<0>
-        >
-      >
-    >;
+template <typename POLICY_DATA>
+struct BasicNestedLoopExec<DEPTH_3_COLLAPSE_SEQ_OUTER, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      0,
+      RAJA::seq_exec,
+      RAJA::statement::Collapse<
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::ArgList<1, 2>,
+          RAJA::statement::Lambda<0>>>>;
 };
 
-template<typename POLICY_DATA>
-struct BasicNestedLoopExec<DEPTH_3_COLLAPSE_SEQ_INNER, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::Collapse< typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::ArgList<0,1>,
-        RAJA::statement::For<2, RAJA::seq_exec,
-          RAJA::statement::Lambda<0>
-        >
-      >
-    >;
+template <typename POLICY_DATA>
+struct BasicNestedLoopExec<DEPTH_3_COLLAPSE_SEQ_INNER, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::Collapse<
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::ArgList<0, 1>,
+      RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
 };
 
-#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or defined(RAJA_ENABLE_SYCL)
-
-template<typename POLICY_DATA>
-struct BasicNestedLoopExec<DEVICE_DEPTH_2, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<0>>::type,  // row
-          RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<1>>::type,  // col
-            RAJA::statement::Lambda<0>
-          >
-        >
-      > // end CudaKernel
-    >;
+#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or                   \
+    defined(RAJA_ENABLE_SYCL)
+
+template <typename POLICY_DATA>
+struct BasicNestedLoopExec<DEVICE_DEPTH_2, POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          1,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,  // row
+          RAJA::statement::For<
+              0,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,  // col
+              RAJA::statement::Lambda<0>>>>  // end DEVICE_KERNEL
+                         >;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP
diff --git a/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambda-impl.hpp b/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambda-impl.hpp
index 5c2cdd5149..8f9d35df6f 100644
--- a/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambda-impl.hpp
+++ b/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambda-impl.hpp
@@ -10,20 +10,26 @@
 
 #include "RAJA_test-abs.hpp"
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_resource<EXEC_POL>( segs, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args)
+{
+  RAJA::kernel_resource<EXEC_POL>(segs, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args) {
-  RAJA::kernel<EXEC_POL>( segs, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel<EXEC_POL>(segs, args...);
 }
 
 //
@@ -31,10 +37,8 @@ typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORK
 // Define list of nested loop types the MultiLambda test supports.
 //
 //
-using MultiLambdaSupportedLoopTypeList = camp::list<
-  DEPTH_2,
-  DEPTH_2_COLLAPSE,
-  DEVICE_DEPTH_2>;
+using MultiLambdaSupportedLoopTypeList =
+    camp::list<DEPTH_2, DEPTH_2_COLLAPSE, DEVICE_DEPTH_2>;
 
 //
 //
@@ -42,74 +46,97 @@ using MultiLambdaSupportedLoopTypeList = camp::list<
 //
 //
 template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE>
-void KernelNestedLoopTest(){
-  constexpr static int N = 1000;
+void KernelNestedLoopTest()
+{
+  constexpr static int N   = 1000;
   constexpr static int DIM = 2;
 
-  camp::resources::Resource host_res{camp::resources::Host()};
-  WORKING_RES work_res{WORKING_RES::get_default()};
+  camp::resources::Resource host_res {camp::resources::Host()};
+  WORKING_RES work_res {WORKING_RES::get_default()};
 
   // Allocate Tests Data
-  double* work_arrA = work_res.template allocate<double>(N*N);
-  double* work_arrB = work_res.template allocate<double>(N*N);
+  double* work_arrA = work_res.template allocate<double>(N * N);
+  double* work_arrB = work_res.template allocate<double>(N * N);
 
-  double* test_arrA = host_res.allocate<double>(N*N);
-  double* test_arrB = host_res.allocate<double>(N*N);
+  double* test_arrA = host_res.allocate<double>(N * N);
+  double* test_arrB = host_res.allocate<double>(N * N);
 
-  double* check_arrA = host_res.allocate<double>(N*N);
-  double* check_arrB = host_res.allocate<double>(N*N);
+  double* check_arrA = host_res.allocate<double>(N * N);
+  double* check_arrB = host_res.allocate<double>(N * N);
 
   // Initialize Data
-  for (RAJA::Index_type i = 0; i < N*N; i++) {
-    test_arrA[i] = i * 1.2;  test_arrB[i] = i * 0.5;
+  for (RAJA::Index_type i = 0; i < N * N; i++)
+  {
+    test_arrA[i] = i * 1.2;
+    test_arrB[i] = i * 0.5;
   }
-  work_res.memcpy(work_arrA, test_arrA, sizeof(double) * RAJA::stripIndexType(N*N));
-  work_res.memcpy(work_arrB, test_arrB, sizeof(double) * RAJA::stripIndexType(N*N));
+  work_res.memcpy(work_arrA, test_arrA,
+                  sizeof(double) * RAJA::stripIndexType(N * N));
+  work_res.memcpy(work_arrB, test_arrB,
+                  sizeof(double) * RAJA::stripIndexType(N * N));
 
   // Initialize RAJA Views
-  RAJA::View< double, RAJA::Layout<DIM> > test_viewA(test_arrA, N, N);
-  RAJA::View< double, RAJA::Layout<DIM> > test_viewB(test_arrB, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> test_viewA(test_arrA, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> test_viewB(test_arrB, N, N);
 
   // Calculate Test data
-  for (RAJA::Index_type i = 1; i < N-1; ++i ) {
-    for (RAJA::Index_type j = 1; j < N-1; ++j ) {
-      test_viewB(i,j) = 0.2 * (test_viewA(i,j) + test_viewA(i,j-1) + test_viewA(i,j+1) + test_viewA(i+1,j) + test_viewA(i-1,j));
+  for (RAJA::Index_type i = 1; i < N - 1; ++i)
+  {
+    for (RAJA::Index_type j = 1; j < N - 1; ++j)
+    {
+      test_viewB(i, j) = 0.2 * (test_viewA(i, j) + test_viewA(i, j - 1) +
+                                test_viewA(i, j + 1) + test_viewA(i + 1, j) +
+                                test_viewA(i - 1, j));
     }
   }
-  for (RAJA::Index_type i = 1; i < N-1; ++i ) {
-    for (RAJA::Index_type j = 1; j < N-1; ++j ) {
-      test_viewA(i,j) = 0.2 * (test_viewB(i,j) + test_viewB(i,j-1) + test_viewB(i,j+1) + test_viewB(i+1,j) + test_viewB(i-1,j));
+  for (RAJA::Index_type i = 1; i < N - 1; ++i)
+  {
+    for (RAJA::Index_type j = 1; j < N - 1; ++j)
+    {
+      test_viewA(i, j) = 0.2 * (test_viewB(i, j) + test_viewB(i, j - 1) +
+                                test_viewB(i, j + 1) + test_viewB(i + 1, j) +
+                                test_viewB(i - 1, j));
     }
-  } 
+  }
 
-  RAJA::View< double, RAJA::Layout<DIM> > work_viewA(work_arrA, N, N);
-  RAJA::View< double, RAJA::Layout<DIM> > work_viewB(work_arrB, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> work_viewA(work_arrA, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> work_viewB(work_arrB, N, N);
 
   call_kernel<EXEC_POLICY, USE_RESOURCE>(
-    RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
-                     RAJA::RangeSegment{1, N-1}),
-
-    // Resource
-    work_res,
-
-    // lambda 0
-    [=] RAJA_HOST_DEVICE (RAJA::Index_type i, RAJA::Index_type j) {
-      work_viewB(i,j) = 0.2 * (work_viewA(i,j) + work_viewA(i,j-1) + work_viewA(i,j+1) + work_viewA(i+1,j) + work_viewA(i-1,j));
-    },
-
-    // lambda 1
-    [=] RAJA_HOST_DEVICE (RAJA::Index_type i, RAJA::Index_type j) {
-      work_viewA(i,j) = 0.2 * (work_viewB(i,j) + work_viewB(i,j-1) + work_viewB(i,j+1) + work_viewB(i+1,j) + work_viewB(i-1,j));
-    }
-  );
-
-  work_res.memcpy(check_arrA, work_arrA, sizeof(double) * RAJA::stripIndexType(N*N));
-  work_res.memcpy(check_arrB, work_arrB, sizeof(double) * RAJA::stripIndexType(N*N));
-
-  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment{0, N*N}, [=] (RAJA::Index_type i) {
-    ASSERT_TRUE( RAJA::test_abs(test_arrA[i] - check_arrA[i]) < 10e-8 );
-    ASSERT_TRUE( RAJA::test_abs(test_arrB[i] - check_arrB[i]) < 10e-8 );
-  });
+      RAJA::make_tuple(RAJA::RangeSegment {1, N - 1},
+                       RAJA::RangeSegment {1, N - 1}),
+
+      // Resource
+      work_res,
+
+      // lambda 0
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i, RAJA::Index_type j)
+      {
+        work_viewB(i, j) = 0.2 * (work_viewA(i, j) + work_viewA(i, j - 1) +
+                                  work_viewA(i, j + 1) + work_viewA(i + 1, j) +
+                                  work_viewA(i - 1, j));
+      },
+
+      // lambda 1
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i, RAJA::Index_type j)
+      {
+        work_viewA(i, j) = 0.2 * (work_viewB(i, j) + work_viewB(i, j - 1) +
+                                  work_viewB(i, j + 1) + work_viewB(i + 1, j) +
+                                  work_viewB(i - 1, j));
+      });
+
+  work_res.memcpy(check_arrA, work_arrA,
+                  sizeof(double) * RAJA::stripIndexType(N * N));
+  work_res.memcpy(check_arrB, work_arrB,
+                  sizeof(double) * RAJA::stripIndexType(N * N));
+
+  RAJA::forall<RAJA::seq_exec>(
+      RAJA::RangeSegment {0, N * N},
+      [=](RAJA::Index_type i)
+      {
+        ASSERT_TRUE(RAJA::test_abs(test_arrA[i] - check_arrA[i]) < 10e-8);
+        ASSERT_TRUE(RAJA::test_abs(test_arrB[i] - check_arrB[i]) < 10e-8);
+      });
 
   work_res.deallocate(work_arrA);
   work_res.deallocate(work_arrB);
@@ -126,62 +153,64 @@ void KernelNestedLoopTest(){
 // Defining the Kernel Loop structure for MultiLambda Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename POLICY_DATA>
 struct MultiLambdaNestedLoopExec;
 
-template<typename POLICY_DATA>
-struct MultiLambdaNestedLoopExec<DEPTH_2, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::Lambda<0>
-        >
-      >,
-      RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::Lambda<1>
-        >
-      >
-    >;
+template <typename POLICY_DATA>
+struct MultiLambdaNestedLoopExec<DEPTH_2, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<
+      RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::Lambda<0>>>,
+      RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::Lambda<1>>>>;
 };
 
-template<typename POLICY_DATA>
-struct MultiLambdaNestedLoopExec<DEPTH_2_COLLAPSE, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::Collapse< typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::ArgList<1,0>,
-        RAJA::statement::Lambda<0>
-      >,
-      RAJA::statement::Collapse< typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::ArgList<1,0>,
-        RAJA::statement::Lambda<1>
-      >
-    >;
+template <typename POLICY_DATA>
+struct MultiLambdaNestedLoopExec<DEPTH_2_COLLAPSE, POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::Collapse<
+                             typename camp::at<POLICY_DATA, camp::num<0>>::type,
+                             RAJA::ArgList<1, 0>,
+                             RAJA::statement::Lambda<0>>,
+                         RAJA::statement::Collapse<
+                             typename camp::at<POLICY_DATA, camp::num<0>>::type,
+                             RAJA::ArgList<1, 0>,
+                             RAJA::statement::Lambda<1>>>;
 };
 
-#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or defined(RAJA_ENABLE_SYCL)
-
-template<typename POLICY_DATA>
-struct MultiLambdaNestedLoopExec<DEVICE_DEPTH_2, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      >,
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::Lambda<1>
-          >
-        >
-      >
-    >;
+#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or                   \
+    defined(RAJA_ENABLE_SYCL)
+
+template <typename POLICY_DATA>
+struct MultiLambdaNestedLoopExec<DEVICE_DEPTH_2, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<
+      RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::Lambda<0>>>>,
+      RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::Lambda<1>>>>>;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP
diff --git a/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambdaParam-impl.hpp b/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambdaParam-impl.hpp
index 8c62b908e3..bcf51593d6 100644
--- a/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambdaParam-impl.hpp
+++ b/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambdaParam-impl.hpp
@@ -10,22 +10,30 @@
 
 #include "RAJA_test-abs.hpp"
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, PARAMS&& params, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_param_resource<EXEC_POL>( segs, params, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type call_kernel(SEGMENTS&& segs,
+                                                        PARAMS&& params,
+                                                        WORKING_RES work_res,
+                                                        Args&&... args)
+{
+  RAJA::kernel_param_resource<EXEC_POL>(segs, params, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args) {
-  RAJA::kernel_param<EXEC_POL>( segs, params, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel_param<EXEC_POL>(segs, params, args...);
 }
 
 //
@@ -33,10 +41,8 @@ typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, PARA
 // Define list of nested loop types the MultiLambdaParam test supports.
 //
 //
-using MultiLambdaParamSupportedLoopTypeList = camp::list<
-  DEPTH_3,
-  DEVICE_DEPTH_3
-  >;
+using MultiLambdaParamSupportedLoopTypeList =
+    camp::list<DEPTH_3, DEVICE_DEPTH_3>;
 
 //
 //
@@ -44,93 +50,97 @@ using MultiLambdaParamSupportedLoopTypeList = camp::list<
 //
 //
 template <typename WORKING_RES, typename EXEC_POLICY, bool USE_RESOURCE>
-void KernelNestedLoopTest(){
+void KernelNestedLoopTest()
+{
 
-  constexpr static int N = 100;
+  constexpr static int N   = 100;
   constexpr static int DIM = 2;
 
-  camp::resources::Resource host_res{camp::resources::Host()};
-  WORKING_RES work_res{WORKING_RES::get_default()};
+  camp::resources::Resource host_res {camp::resources::Host()};
+  WORKING_RES work_res {WORKING_RES::get_default()};
 
   // Allocate Tests Data
-  double* work_arrA = work_res.template allocate<double>(N*N);
-  double* work_arrB = work_res.template allocate<double>(N*N);
-  double* work_arrC = work_res.template allocate<double>(N*N);
+  double* work_arrA = work_res.template allocate<double>(N * N);
+  double* work_arrB = work_res.template allocate<double>(N * N);
+  double* work_arrC = work_res.template allocate<double>(N * N);
 
-  double* test_arrA = host_res.allocate<double>(N*N);
-  double* test_arrB = host_res.allocate<double>(N*N);
-  double* test_arrC = host_res.allocate<double>(N*N);
+  double* test_arrA = host_res.allocate<double>(N * N);
+  double* test_arrB = host_res.allocate<double>(N * N);
+  double* test_arrC = host_res.allocate<double>(N * N);
 
-  double* check_arrC = host_res.allocate<double>(N*N);
+  double* check_arrC = host_res.allocate<double>(N * N);
 
   // Initialize RAJA Views
-  RAJA::View< double, RAJA::Layout<DIM> > test_viewA(test_arrA, N, N);
-  RAJA::View< double, RAJA::Layout<DIM> > test_viewB(test_arrB, N, N);
-  RAJA::View< double, RAJA::Layout<DIM> > test_viewC(test_arrC, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> test_viewA(test_arrA, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> test_viewB(test_arrB, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> test_viewC(test_arrC, N, N);
 
-  RAJA::View< double, RAJA::Layout<DIM> > work_viewA(work_arrA, N, N);
-  RAJA::View< double, RAJA::Layout<DIM> > work_viewB(work_arrB, N, N);
-  RAJA::View< double, RAJA::Layout<DIM> > work_viewC(work_arrC, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> work_viewA(work_arrA, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> work_viewB(work_arrB, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> work_viewC(work_arrC, N, N);
 
   // Initialize Data
-  for (int row = 0; row < N; ++row) {
-    for (int col = 0; col < N; ++col) {
+  for (int row = 0; row < N; ++row)
+  {
+    for (int col = 0; col < N; ++col)
+    {
       test_viewA(row, col) = row;
       test_viewB(row, col) = col;
       test_viewB(row, col) = 0;
     }
   }
 
-  work_res.memcpy(work_arrA, test_arrA, sizeof(double) * RAJA::stripIndexType(N*N));
-  work_res.memcpy(work_arrB, test_arrB, sizeof(double) * RAJA::stripIndexType(N*N));
-  work_res.memcpy(work_arrC, test_arrC, sizeof(double) * RAJA::stripIndexType(N*N));
+  work_res.memcpy(work_arrA, test_arrA,
+                  sizeof(double) * RAJA::stripIndexType(N * N));
+  work_res.memcpy(work_arrB, test_arrB,
+                  sizeof(double) * RAJA::stripIndexType(N * N));
+  work_res.memcpy(work_arrC, test_arrC,
+                  sizeof(double) * RAJA::stripIndexType(N * N));
 
   // Calculate Test data
-  for (int row = 0; row < N; ++row) {
-    for (int col = 0; col < N; ++col) {
+  for (int row = 0; row < N; ++row)
+  {
+    for (int col = 0; col < N; ++col)
+    {
 
       double dot = 0.0;
-      for (int k = 0; k < N; ++k) {
+      for (int k = 0; k < N; ++k)
+      {
         dot += test_viewA(row, k) * test_viewB(k, col);
       }
       test_viewC(row, col) = dot;
-
     }
   }
-  
+
   // Calculate Working data
   call_kernel<EXEC_POLICY, USE_RESOURCE>(
-    RAJA::make_tuple(RAJA::RangeSegment{0, N},
-                     RAJA::RangeSegment{0, N},
-                     RAJA::RangeSegment{0, N}),
+      RAJA::make_tuple(RAJA::RangeSegment {0, N}, RAJA::RangeSegment {0, N},
+                       RAJA::RangeSegment {0, N}),
 
-    RAJA::tuple<double>{0.0},
+      RAJA::tuple<double> {0.0},
 
-    // Resource
-    work_res,
+      // Resource
+      work_res,
 
-    // lambda 0
-    [=] RAJA_HOST_DEVICE (double& dot) {
-       dot = 0.0;
-    },
+      // lambda 0
+      [=] RAJA_HOST_DEVICE(double& dot) { dot = 0.0; },
 
-    // lambda 1
-    [=] RAJA_HOST_DEVICE (int col, int row, int k, double& dot) {
-       dot += work_viewA(row, k) * work_viewB(k, col);
-    },
+      // lambda 1
+      [=] RAJA_HOST_DEVICE(int col, int row, int k, double& dot)
+      { dot += work_viewA(row, k) * work_viewB(k, col); },
 
-    // lambda 2
-    [=] RAJA_HOST_DEVICE (int col, int row, double& dot) {
-       work_viewC(row, col) = dot;
-    }
+      // lambda 2
+      [=] RAJA_HOST_DEVICE(int col, int row, double& dot)
+      { work_viewC(row, col) = dot; }
 
   );
 
-  work_res.memcpy(check_arrC, work_arrC, sizeof(double) * RAJA::stripIndexType(N*N));
+  work_res.memcpy(check_arrC, work_arrC,
+                  sizeof(double) * RAJA::stripIndexType(N * N));
 
-  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment{0, N*N}, [=] (RAJA::Index_type i) {
-    ASSERT_TRUE( RAJA::test_abs(test_arrC[i] - check_arrC[i]) < 10e-8 );
-  });
+  RAJA::forall<RAJA::seq_exec>(
+      RAJA::RangeSegment {0, N * N}, [=](RAJA::Index_type i)
+      { ASSERT_TRUE(RAJA::test_abs(test_arrC[i] - check_arrC[i]) < 10e-8); });
 
   work_res.deallocate(work_arrA);
   work_res.deallocate(work_arrB);
@@ -148,43 +158,57 @@ void KernelNestedLoopTest(){
 // Defining the Kernel Loop structure for MultiLambdaParam Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename POLICY_DATA>
 struct MultiLambdaParamNestedLoopExec;
 
-template<typename POLICY_DATA>
-struct MultiLambdaParamNestedLoopExec<DEPTH_3, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<1>>::type,
+template <typename POLICY_DATA>
+struct MultiLambdaParamNestedLoopExec<DEPTH_3, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::For<
+      1,
+      typename camp::at<POLICY_DATA, camp::num<0>>::type,
+      RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<1>>::type,
           RAJA::statement::Lambda<0, RAJA::Params<0>>,  // dot = 0.0
-          RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-            RAJA::statement::Lambda<1> // inner loop: dot += ...
-          >,
-          RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>>   // set C(row, col) = dot
-        >
-      >
-    >;
+          RAJA::statement::For<
+              2,
+              typename camp::at<POLICY_DATA, camp::num<2>>::type,
+              RAJA::statement::Lambda<1>  // inner loop: dot += ...
+              >,
+          RAJA::statement::
+              Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>>  // set
+                                                            // C(row,
+                                                            // col)
+                                                            // =
+                                                            // dot
+          >>>;
 };
 
-#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or defined(RAJA_ENABLE_SYCL)
-
-template<typename POLICY_DATA>
-struct MultiLambdaParamNestedLoopExec<DEVICE_DEPTH_3, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-      RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-          RAJA::statement::Lambda<0, RAJA::Params<0>>,  // dot = 0.0
-          RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<2>>::type,
-            RAJA::statement::Lambda<1> // inner loop: dot += ...
-          >,
-          RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>>   // set C(row, col) = dot
-        >
-      >
-      > // end CudaKernel
-    >;
+#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or                   \
+    defined(RAJA_ENABLE_SYCL)
+
+template <typename POLICY_DATA>
+struct MultiLambdaParamNestedLoopExec<DEVICE_DEPTH_3, POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          1,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              0,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,  // dot = 0.0
+              RAJA::statement::For<
+                  2,
+                  typename camp::at<POLICY_DATA, camp::num<2>>::type,
+                  RAJA::statement::Lambda<1>  // inner loop: dot += ...
+                  >,
+              RAJA::statement::
+                  Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>>  // set C(row,
+                                                                // col) = dot
+              >>>  // end DEVICE_KERNEL
+                         >;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP
diff --git a/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-Basic.hpp b/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-Basic.hpp
index 37cab1789b..27ed5270bc 100644
--- a/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-Basic.hpp
+++ b/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-Basic.hpp
@@ -17,10 +17,12 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopBasicTest);
 template <typename T>
-class KernelNestedLoopBasicTest : public ::testing::Test {};
+class KernelNestedLoopBasicTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopBasicTest, NestedLoopBasicKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
+TYPED_TEST_P(KernelNestedLoopBasicTest, NestedLoopBasicKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<1>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -35,11 +37,11 @@ TYPED_TEST_P(KernelNestedLoopBasicTest, NestedLoopBasicKernel) {
   constexpr bool USE_RES = false;
 
   // For double nested loop tests the third arg is ignored.
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RES>( LOOP_TYPE(), 1,1,1);
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RES>( LOOP_TYPE(), 40,30,20);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RES>(LOOP_TYPE(), 1, 1, 1);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RES>(LOOP_TYPE(), 40, 30,
+                                                          20);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopBasicTest,
-                            NestedLoopBasicKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopBasicTest, NestedLoopBasicKernel);
 
 #endif  // __TEST_KERNEL_NESTED_LOOP_BASIC_HPP__
diff --git a/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-MultiLambda.hpp b/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-MultiLambda.hpp
index cddcb005f4..a1ce6bbbd2 100644
--- a/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-MultiLambda.hpp
+++ b/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-MultiLambda.hpp
@@ -17,10 +17,12 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopMultiLambdaTest);
 template <typename T>
-class KernelNestedLoopMultiLambdaTest : public ::testing::Test {};
+class KernelNestedLoopMultiLambdaTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopMultiLambdaTest, NestedLoopMultiLambdaKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
+TYPED_TEST_P(KernelNestedLoopMultiLambdaTest, NestedLoopMultiLambdaKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<1>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -30,7 +32,8 @@ TYPED_TEST_P(KernelNestedLoopMultiLambdaTest, NestedLoopMultiLambdaKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename MultiLambdaNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename MultiLambdaNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
 
   constexpr bool USE_RES = false;
 
diff --git a/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-MultiLambdaParam.hpp b/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-MultiLambdaParam.hpp
index eae84e88c9..df5264ec43 100644
--- a/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-MultiLambdaParam.hpp
+++ b/test/functional/kernel/nested-loop/tests/test-kernel-nested-loop-MultiLambdaParam.hpp
@@ -17,10 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopMultiLambdaParamTest);
 template <typename T>
-class KernelNestedLoopMultiLambdaParamTest : public ::testing::Test {};
+class KernelNestedLoopMultiLambdaParamTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopMultiLambdaParamTest, NestedLoopMultiLambdaParamKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
+TYPED_TEST_P(KernelNestedLoopMultiLambdaParamTest,
+             NestedLoopMultiLambdaParamKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<1>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -30,7 +33,8 @@ TYPED_TEST_P(KernelNestedLoopMultiLambdaParamTest, NestedLoopMultiLambdaParamKer
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename MultiLambdaParamNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename MultiLambdaParamNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
 
   constexpr bool USE_RES = false;
 
diff --git a/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-Basic.hpp b/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-Basic.hpp
index 7845500ae7..1fbec4bb91 100644
--- a/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-Basic.hpp
+++ b/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-Basic.hpp
@@ -16,10 +16,12 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopBasicTest);
 template <typename T>
-class KernelNestedLoopBasicTest : public ::testing::Test {};
+class KernelNestedLoopBasicTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopBasicTest, NestedLoopBasicKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
+TYPED_TEST_P(KernelNestedLoopBasicTest, NestedLoopBasicKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<1>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -34,11 +36,11 @@ TYPED_TEST_P(KernelNestedLoopBasicTest, NestedLoopBasicKernel) {
   constexpr bool USE_RES = true;
 
   // For double nested loop tests the third arg is ignored.
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RES>( LOOP_TYPE(), 1,1,1);
-  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RES>( LOOP_TYPE(), 40,30,20);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RES>(LOOP_TYPE(), 1, 1, 1);
+  KernelNestedLoopTest<WORKING_RES, EXEC_POLICY, USE_RES>(LOOP_TYPE(), 40, 30,
+                                                          20);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopBasicTest,
-                            NestedLoopBasicKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelNestedLoopBasicTest, NestedLoopBasicKernel);
 
 #endif  // __TEST_KERNEL_NESTED_LOOP_RESOURCE_BASIC_HPP__
diff --git a/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-MultiLambda.hpp b/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-MultiLambda.hpp
index 75616bea68..798faf3f99 100644
--- a/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-MultiLambda.hpp
+++ b/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-MultiLambda.hpp
@@ -17,10 +17,12 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopMultiLambdaTest);
 template <typename T>
-class KernelNestedLoopMultiLambdaTest : public ::testing::Test {};
+class KernelNestedLoopMultiLambdaTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopMultiLambdaTest, NestedLoopMultiLambdaKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
+TYPED_TEST_P(KernelNestedLoopMultiLambdaTest, NestedLoopMultiLambdaKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<1>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -30,7 +32,8 @@ TYPED_TEST_P(KernelNestedLoopMultiLambdaTest, NestedLoopMultiLambdaKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename MultiLambdaNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename MultiLambdaNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
 
   constexpr bool USE_RES = true;
 
diff --git a/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-MultiLambdaParam.hpp b/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-MultiLambdaParam.hpp
index 02dbe213cc..31dfbc1bd8 100644
--- a/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-MultiLambdaParam.hpp
+++ b/test/functional/kernel/nested-loop/tests/test-kernel-resource-nested-loop-MultiLambdaParam.hpp
@@ -17,10 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelNestedLoopMultiLambdaParamTest);
 template <typename T>
-class KernelNestedLoopMultiLambdaParamTest : public ::testing::Test {};
+class KernelNestedLoopMultiLambdaParamTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelNestedLoopMultiLambdaParamTest, NestedLoopMultiLambdaParamKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
+TYPED_TEST_P(KernelNestedLoopMultiLambdaParamTest,
+             NestedLoopMultiLambdaParamKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<1>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -30,7 +33,8 @@ TYPED_TEST_P(KernelNestedLoopMultiLambdaParamTest, NestedLoopMultiLambdaParamKer
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename MultiLambdaParamNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename MultiLambdaParamNestedLoopExec<LOOP_TYPE, LOOP_POLS>::type;
 
   constexpr bool USE_RES = true;
 
diff --git a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2D.hpp b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2D.hpp
index f2f2d0acab..0e416b44a5 100644
--- a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2D.hpp
+++ b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2D.hpp
@@ -8,123 +8,116 @@
 #ifndef __TEST_KERNEL_REDUCELOC_MAX2D_HPP__
 #define __TEST_KERNEL_REDUCELOC_MAX2D_HPP__
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename FORALL_POLICY, typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename FORALL_POLICY,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void KernelLocMax2DTestImpl(const int xdim, const int ydim)
 {
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE ** workarr2D;
-  DATA_TYPE ** checkarr2D;
-  DATA_TYPE ** testarr2D;
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE** workarr2D;
+  DATA_TYPE** checkarr2D;
+  DATA_TYPE** testarr2D;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // square 2D array, xdim x ydim
   INDEX_TYPE array_length = xdim * ydim;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
-  allocateForallTestData<DATA_TYPE *> ( ydim,
-                                        work_res,
-                                        &workarr2D,
-                                        &checkarr2D,
-                                        &testarr2D
-                                      );
+  allocateForallTestData<DATA_TYPE*>(ydim, work_res, &workarr2D, &checkarr2D,
+                                     &testarr2D);
 
   // set rows to point to check and work _arrays
-  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0,ydim);
+  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0, ydim);
   RAJA::forall<FORALL_POLICY>(seg, [=] RAJA_HOST_DEVICE(INDEX_TYPE zz)
-  {
-    workarr2D[zz] = work_array + zz * ydim;
-  });
+                              { workarr2D[zz] = work_array + zz * ydim; });
 
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    checkarr2D[zz] = check_array + zz * ydim;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg, [=](INDEX_TYPE zz)
+                               { checkarr2D[zz] = check_array + zz * ydim; });
 
   // initializing  values
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    for ( int xx = 0; xx < xdim; ++xx )
-    {
-      checkarr2D[zz][xx] = zz*xdim + xx;
-    }
-    checkarr2D[ydim-1][xdim-1] = 0;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg,
+                               [=](INDEX_TYPE zz)
+                               {
+                                 for (int xx = 0; xx < xdim; ++xx)
+                                 {
+                                   checkarr2D[zz][xx] = zz * xdim + xx;
+                                 }
+                                 checkarr2D[ydim - 1][xdim - 1] = 0;
+                               });
 
   work_res.memcpy(work_array, check_array, sizeof(DATA_TYPE) * array_length);
 
   RAJA::TypedRangeSegment<INDEX_TYPE> colrange(0, xdim);
   RAJA::TypedRangeSegment<INDEX_TYPE> rowrange(0, ydim);
 
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, Index2D> maxloc_reducer((DATA_TYPE)0, Index2D(0, 0));
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, Index2D> maxloc_reducer(
+      (DATA_TYPE)0, Index2D(0, 0));
 
-  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
-                           [=] RAJA_HOST_DEVICE (int c, int r) {
-                             maxloc_reducer.maxloc(workarr2D[r][c], Index2D(c, r));
-                           });
+  RAJA::kernel<EXEC_POLICY>(
+      RAJA::make_tuple(colrange, rowrange), [=] RAJA_HOST_DEVICE(int c, int r)
+      { maxloc_reducer.maxloc(workarr2D[r][c], Index2D(c, r)); });
 
   // CPU answer
-  RAJA::ReduceMaxLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkmaxloc_reducer((DATA_TYPE)0, Index2D(0, 0));
-
-  RAJA::forall<RAJA::seq_exec>(colrange, [=] (INDEX_TYPE c) {
-    for( int r = 0; r < ydim; ++r)
-    {
-      checkmaxloc_reducer.maxloc(checkarr2D[r][c], Index2D(c, r));
-    }
-  });
-
-  Index2D raja_loc = maxloc_reducer.getLoc();
-  DATA_TYPE raja_max = (DATA_TYPE)maxloc_reducer.get();
-  Index2D checkraja_loc = checkmaxloc_reducer.getLoc();
+  RAJA::ReduceMaxLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkmaxloc_reducer(
+      (DATA_TYPE)0, Index2D(0, 0));
+
+  RAJA::forall<RAJA::seq_exec>(colrange,
+                               [=](INDEX_TYPE c)
+                               {
+                                 for (int r = 0; r < ydim; ++r)
+                                 {
+                                   checkmaxloc_reducer.maxloc(checkarr2D[r][c],
+                                                              Index2D(c, r));
+                                 }
+                               });
+
+  Index2D raja_loc        = maxloc_reducer.getLoc();
+  DATA_TYPE raja_max      = (DATA_TYPE)maxloc_reducer.get();
+  Index2D checkraja_loc   = checkmaxloc_reducer.getLoc();
   DATA_TYPE checkraja_max = (DATA_TYPE)checkmaxloc_reducer.get();
 
   ASSERT_DOUBLE_EQ((DATA_TYPE)checkraja_max, (DATA_TYPE)raja_max);
   ASSERT_EQ(checkraja_loc.idx, raja_loc.idx);
   ASSERT_EQ(checkraja_loc.idy, raja_loc.idy);
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE *> ( work_res,
-                                          workarr2D,
-                                          checkarr2D,
-                                          testarr2D
-                                        );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE*>(work_res, workarr2D, checkarr2D,
+                                       testarr2D);
 }
 
 
 TYPED_TEST_SUITE_P(KernelLocMax2DTest);
 template <typename T>
 class KernelLocMax2DTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelLocMax2DTest, LocMax2DKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
   using FORALL_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<4>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<5>>::type;
 
-  KernelLocMax2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(10, 10);
-  KernelLocMax2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(151, 151);
-  KernelLocMax2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(362, 362);
+  KernelLocMax2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                         EXEC_POLICY, REDUCE_POLICY>(10, 10);
+  KernelLocMax2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                         EXEC_POLICY, REDUCE_POLICY>(151, 151);
+  KernelLocMax2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                         EXEC_POLICY, REDUCE_POLICY>(362, 362);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelLocMax2DTest,
-                            LocMax2DKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelLocMax2DTest, LocMax2DKernel);
 
 #endif  // __TEST_KERNEL_REDUCELOC_MAX2D_HPP__
diff --git a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DView.hpp b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DView.hpp
index bd648ff88c..938f0b666f 100644
--- a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DView.hpp
+++ b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DView.hpp
@@ -8,56 +8,50 @@
 #ifndef __TEST_KERNEL_REDUCELOC_MAX2DVIEW_HPP__
 #define __TEST_KERNEL_REDUCELOC_MAX2DVIEW_HPP__
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename FORALL_POLICY, typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename FORALL_POLICY,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void KernelLocMax2DViewTestImpl(const int xdim, const int ydim)
 {
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE ** workarr2D;
-  DATA_TYPE ** checkarr2D;
-  DATA_TYPE ** testarr2D;
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE** workarr2D;
+  DATA_TYPE** checkarr2D;
+  DATA_TYPE** testarr2D;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // square 2D array, xdim x ydim
   INDEX_TYPE array_length = xdim * ydim;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
-  allocateForallTestData<DATA_TYPE *> ( ydim,
-                                        work_res,
-                                        &workarr2D,
-                                        &checkarr2D,
-                                        &testarr2D
-                                      );
+  allocateForallTestData<DATA_TYPE*>(ydim, work_res, &workarr2D, &checkarr2D,
+                                     &testarr2D);
 
   // set rows to point to check and work _arrays
-  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0,ydim);
+  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0, ydim);
   RAJA::forall<FORALL_POLICY>(seg, [=] RAJA_HOST_DEVICE(INDEX_TYPE zz)
-  {
-    workarr2D[zz] = work_array + zz * ydim;
-  });
+                              { workarr2D[zz] = work_array + zz * ydim; });
 
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    checkarr2D[zz] = check_array + zz * ydim;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg, [=](INDEX_TYPE zz)
+                               { checkarr2D[zz] = check_array + zz * ydim; });
 
   // initializing  values
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    for ( int xx = 0; xx < xdim; ++xx )
-    {
-      checkarr2D[zz][xx] = zz*xdim + xx;
-    }
-    checkarr2D[ydim-1][xdim-1] = 0;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg,
+                               [=](INDEX_TYPE zz)
+                               {
+                                 for (int xx = 0; xx < xdim; ++xx)
+                                 {
+                                   checkarr2D[zz][xx] = zz * xdim + xx;
+                                 }
+                                 checkarr2D[ydim - 1][xdim - 1] = 0;
+                               });
 
   work_res.memcpy(work_array, check_array, sizeof(DATA_TYPE) * array_length);
 
@@ -66,67 +60,66 @@ void KernelLocMax2DViewTestImpl(const int xdim, const int ydim)
 
   RAJA::View<DATA_TYPE, RAJA::Layout<2>> ArrView(work_array, xdim, ydim);
 
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, Index2D> maxloc_reducer((DATA_TYPE)0, Index2D(0, 0));
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, Index2D> maxloc_reducer(
+      (DATA_TYPE)0, Index2D(0, 0));
 
-  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
-                           [=] RAJA_HOST_DEVICE (int c, int r) {
-                             maxloc_reducer.maxloc(ArrView(r, c), Index2D(c, r));
-                           });
+  RAJA::kernel<EXEC_POLICY>(
+      RAJA::make_tuple(colrange, rowrange), [=] RAJA_HOST_DEVICE(int c, int r)
+      { maxloc_reducer.maxloc(ArrView(r, c), Index2D(c, r)); });
 
   // CPU answer
-  RAJA::ReduceMaxLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkmaxloc_reducer((DATA_TYPE)0, Index2D(0, 0));
-
-  RAJA::forall<RAJA::seq_exec>(colrange, [=] (INDEX_TYPE c) {
-    for( int r = 0; r < ydim; ++r)
-    {
-      checkmaxloc_reducer.maxloc(checkarr2D[r][c], Index2D(c, r));
-    }
-  });
-
-  Index2D raja_loc = maxloc_reducer.getLoc();
-  DATA_TYPE raja_max = (DATA_TYPE)maxloc_reducer.get();
-  Index2D checkraja_loc = checkmaxloc_reducer.getLoc();
+  RAJA::ReduceMaxLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkmaxloc_reducer(
+      (DATA_TYPE)0, Index2D(0, 0));
+
+  RAJA::forall<RAJA::seq_exec>(colrange,
+                               [=](INDEX_TYPE c)
+                               {
+                                 for (int r = 0; r < ydim; ++r)
+                                 {
+                                   checkmaxloc_reducer.maxloc(checkarr2D[r][c],
+                                                              Index2D(c, r));
+                                 }
+                               });
+
+  Index2D raja_loc        = maxloc_reducer.getLoc();
+  DATA_TYPE raja_max      = (DATA_TYPE)maxloc_reducer.get();
+  Index2D checkraja_loc   = checkmaxloc_reducer.getLoc();
   DATA_TYPE checkraja_max = (DATA_TYPE)checkmaxloc_reducer.get();
 
   ASSERT_DOUBLE_EQ((DATA_TYPE)checkraja_max, (DATA_TYPE)raja_max);
   ASSERT_EQ(checkraja_loc.idx, raja_loc.idx);
   ASSERT_EQ(checkraja_loc.idy, raja_loc.idy);
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE *> ( work_res,
-                                          workarr2D,
-                                          checkarr2D,
-                                          testarr2D
-                                        );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE*>(work_res, workarr2D, checkarr2D,
+                                       testarr2D);
 }
 
 
 TYPED_TEST_SUITE_P(KernelLocMax2DViewTest);
 template <typename T>
 class KernelLocMax2DViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelLocMax2DViewTest, LocMax2DViewKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
   using FORALL_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<4>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<5>>::type;
 
-  KernelLocMax2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(10, 10);
-  KernelLocMax2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(151, 151);
-  KernelLocMax2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(362, 362);
+  KernelLocMax2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                             EXEC_POLICY, REDUCE_POLICY>(10, 10);
+  KernelLocMax2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                             EXEC_POLICY, REDUCE_POLICY>(151, 151);
+  KernelLocMax2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                             EXEC_POLICY, REDUCE_POLICY>(362, 362);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelLocMax2DViewTest,
-                            LocMax2DViewKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelLocMax2DViewTest, LocMax2DViewKernel);
 
 #endif  // __TEST_KERNEL_REDUCELOC_MAX2DVIEW_HPP__
diff --git a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DViewTuple.hpp b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DViewTuple.hpp
index 045fc8e97e..f4dd3648a0 100644
--- a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DViewTuple.hpp
+++ b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DViewTuple.hpp
@@ -8,61 +8,55 @@
 #ifndef __TEST_KERNEL_REDUCELOC_MAX2DVIEWTUPLE_HPP__
 #define __TEST_KERNEL_REDUCELOC_MAX2DVIEWTUPLE_HPP__
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename FORALL_POLICY, typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename FORALL_POLICY,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void KernelLocMax2DViewTupleTestImpl(const int xdim, const int ydim)
 {
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE ** workarr2D;
-  DATA_TYPE ** checkarr2D;
-  DATA_TYPE ** testarr2D;
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE** workarr2D;
+  DATA_TYPE** checkarr2D;
+  DATA_TYPE** testarr2D;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // square 2D array, xdim x ydim
   INDEX_TYPE array_length = xdim * ydim;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
-  allocateForallTestData<DATA_TYPE *> ( ydim,
-                                        work_res,
-                                        &workarr2D,
-                                        &checkarr2D,
-                                        &testarr2D
-                                      );
+  allocateForallTestData<DATA_TYPE*>(ydim, work_res, &workarr2D, &checkarr2D,
+                                     &testarr2D);
 
   // set rows to point to check and work _arrays
-  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0,ydim);
+  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0, ydim);
   RAJA::forall<FORALL_POLICY>(seg, [=] RAJA_HOST_DEVICE(INDEX_TYPE zz)
-  {
-    workarr2D[zz] = work_array + zz * ydim;
-  });
+                              { workarr2D[zz] = work_array + zz * ydim; });
 
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    checkarr2D[zz] = check_array + zz * ydim;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg, [=](INDEX_TYPE zz)
+                               { checkarr2D[zz] = check_array + zz * ydim; });
 
   // initializing  values
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    for ( int xx = 0; xx < xdim; ++xx )
-    {
-      checkarr2D[zz][xx] = zz*xdim + xx;
-    }
-    checkarr2D[ydim-1][xdim-1] = 0;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg,
+                               [=](INDEX_TYPE zz)
+                               {
+                                 for (int xx = 0; xx < xdim; ++xx)
+                                 {
+                                   checkarr2D[zz][xx] = zz * xdim + xx;
+                                 }
+                                 checkarr2D[ydim - 1][xdim - 1] = 0;
+                               });
 
   work_res.memcpy(work_array, check_array, sizeof(DATA_TYPE) * array_length);
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-  //#pragma omp target data map(to:work_array[0:array_length])
+  // #pragma omp target data map(to:work_array[0:array_length])
 #endif
 
   RAJA::TypedRangeSegment<INDEX_TYPE> colrange(0, xdim);
@@ -72,64 +66,72 @@ void KernelLocMax2DViewTupleTestImpl(const int xdim, const int ydim)
 
   RAJA::tuple<DATA_TYPE, DATA_TYPE> LocTup(0, 0);
 
-  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE, RAJA::tuple<DATA_TYPE, DATA_TYPE>> maxloc_reducer((DATA_TYPE)0, LocTup);
+  RAJA::ReduceMaxLoc<REDUCE_POLICY, DATA_TYPE,
+                     RAJA::tuple<DATA_TYPE, DATA_TYPE>>
+      maxloc_reducer((DATA_TYPE)0, LocTup);
 
   RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
-                           [=] RAJA_HOST_DEVICE (int c, int r) {
-                             maxloc_reducer.maxloc(ArrView(r, c), RAJA::make_tuple((DATA_TYPE)c, (DATA_TYPE)r));
-                           });
+                            [=] RAJA_HOST_DEVICE(int c, int r)
+                            {
+                              maxloc_reducer.maxloc(
+                                  ArrView(r, c),
+                                  RAJA::make_tuple((DATA_TYPE)c, (DATA_TYPE)r));
+                            });
 
   // CPU answer
-  RAJA::ReduceMaxLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkmaxloc_reducer((DATA_TYPE)0, Index2D(0, 0));
-
-  RAJA::forall<RAJA::seq_exec>(colrange, [=] (INDEX_TYPE c) {
-    for( int r = 0; r < ydim; ++r)
-    {
-      checkmaxloc_reducer.maxloc(checkarr2D[r][c], Index2D(c, r));
-    }
-  });
+  RAJA::ReduceMaxLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkmaxloc_reducer(
+      (DATA_TYPE)0, Index2D(0, 0));
+
+  RAJA::forall<RAJA::seq_exec>(colrange,
+                               [=](INDEX_TYPE c)
+                               {
+                                 for (int r = 0; r < ydim; ++r)
+                                 {
+                                   checkmaxloc_reducer.maxloc(checkarr2D[r][c],
+                                                              Index2D(c, r));
+                                 }
+                               });
 
   RAJA::tuple<DATA_TYPE, DATA_TYPE> raja_loc = maxloc_reducer.getLoc();
-  DATA_TYPE raja_max = (DATA_TYPE)maxloc_reducer.get();
-  Index2D checkraja_loc = checkmaxloc_reducer.getLoc();
+  DATA_TYPE raja_max                         = (DATA_TYPE)maxloc_reducer.get();
+  Index2D checkraja_loc                      = checkmaxloc_reducer.getLoc();
   DATA_TYPE checkraja_max = (DATA_TYPE)checkmaxloc_reducer.get();
 
   ASSERT_DOUBLE_EQ((DATA_TYPE)checkraja_max, (DATA_TYPE)raja_max);
   ASSERT_EQ(checkraja_loc.idx, RAJA::get<0>(raja_loc));
   ASSERT_EQ(checkraja_loc.idy, RAJA::get<1>(raja_loc));
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE *> ( work_res,
-                                          workarr2D,
-                                          checkarr2D,
-                                          testarr2D
-                                        );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE*>(work_res, workarr2D, checkarr2D,
+                                       testarr2D);
 }
 
 
 TYPED_TEST_SUITE_P(KernelLocMax2DViewTupleTest);
 template <typename T>
 class KernelLocMax2DViewTupleTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelLocMax2DViewTupleTest, LocMax2DViewTupleKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
   using FORALL_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<4>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<5>>::type;
 
-  KernelLocMax2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(10, 10);
-  KernelLocMax2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(151, 151);
-  KernelLocMax2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(362, 362);
+  KernelLocMax2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(
+      10, 10);
+  KernelLocMax2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(
+      151, 151);
+  KernelLocMax2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(
+      362, 362);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelLocMax2DViewTupleTest,
diff --git a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2D.hpp b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2D.hpp
index 090280813c..165adf2284 100644
--- a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2D.hpp
+++ b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2D.hpp
@@ -8,123 +8,116 @@
 #ifndef __TEST_KERNEL_REDUCELOC_MIN2D_HPP__
 #define __TEST_KERNEL_REDUCELOC_MIN2D_HPP__
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename FORALL_POLICY, typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename FORALL_POLICY,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void KernelLocMin2DTestImpl(const int xdim, const int ydim)
 {
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE ** workarr2D;
-  DATA_TYPE ** checkarr2D;
-  DATA_TYPE ** testarr2D;
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE** workarr2D;
+  DATA_TYPE** checkarr2D;
+  DATA_TYPE** testarr2D;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // square 2D array, xdim x ydim
   INDEX_TYPE array_length = xdim * ydim;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
-  allocateForallTestData<DATA_TYPE *> ( ydim,
-                                        work_res,
-                                        &workarr2D,
-                                        &checkarr2D,
-                                        &testarr2D
-                                      );
+  allocateForallTestData<DATA_TYPE*>(ydim, work_res, &workarr2D, &checkarr2D,
+                                     &testarr2D);
 
   // set rows to point to check and work _arrays
-  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0,ydim);
+  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0, ydim);
   RAJA::forall<FORALL_POLICY>(seg, [=] RAJA_HOST_DEVICE(INDEX_TYPE zz)
-  {
-    workarr2D[zz] = work_array + zz * ydim;
-  });
+                              { workarr2D[zz] = work_array + zz * ydim; });
 
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    checkarr2D[zz] = check_array + zz * ydim;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg, [=](INDEX_TYPE zz)
+                               { checkarr2D[zz] = check_array + zz * ydim; });
 
   // initializing  values
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    for ( int xx = 0; xx < xdim; ++xx )
-    {
-      checkarr2D[zz][xx] = zz*xdim + xx + 1;
-    }
-    checkarr2D[ydim-1][xdim-1] = 0;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg,
+                               [=](INDEX_TYPE zz)
+                               {
+                                 for (int xx = 0; xx < xdim; ++xx)
+                                 {
+                                   checkarr2D[zz][xx] = zz * xdim + xx + 1;
+                                 }
+                                 checkarr2D[ydim - 1][xdim - 1] = 0;
+                               });
 
   work_res.memcpy(work_array, check_array, sizeof(DATA_TYPE) * array_length);
 
   RAJA::TypedRangeSegment<INDEX_TYPE> colrange(0, xdim);
   RAJA::TypedRangeSegment<INDEX_TYPE> rowrange(0, ydim);
 
-  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, Index2D> minloc_reducer((DATA_TYPE)1024, Index2D(0, 0));
+  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, Index2D> minloc_reducer(
+      (DATA_TYPE)1024, Index2D(0, 0));
 
-  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
-                           [=] RAJA_HOST_DEVICE (int c, int r) {
-                             minloc_reducer.minloc(workarr2D[r][c], Index2D(c, r));
-                           });
+  RAJA::kernel<EXEC_POLICY>(
+      RAJA::make_tuple(colrange, rowrange), [=] RAJA_HOST_DEVICE(int c, int r)
+      { minloc_reducer.minloc(workarr2D[r][c], Index2D(c, r)); });
 
   // CPU answer
-  RAJA::ReduceMinLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkminloc_reducer((DATA_TYPE)1024, Index2D(0, 0));
-
-  RAJA::forall<RAJA::seq_exec>(colrange, [=] (INDEX_TYPE c) {
-    for( int r = 0; r < ydim; ++r)
-    {
-      checkminloc_reducer.minloc(checkarr2D[r][c], Index2D(c, r));
-    }
-  });
-
-  Index2D raja_loc = minloc_reducer.getLoc();
-  DATA_TYPE raja_min = (DATA_TYPE)minloc_reducer.get();
-  Index2D checkraja_loc = checkminloc_reducer.getLoc();
+  RAJA::ReduceMinLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkminloc_reducer(
+      (DATA_TYPE)1024, Index2D(0, 0));
+
+  RAJA::forall<RAJA::seq_exec>(colrange,
+                               [=](INDEX_TYPE c)
+                               {
+                                 for (int r = 0; r < ydim; ++r)
+                                 {
+                                   checkminloc_reducer.minloc(checkarr2D[r][c],
+                                                              Index2D(c, r));
+                                 }
+                               });
+
+  Index2D raja_loc        = minloc_reducer.getLoc();
+  DATA_TYPE raja_min      = (DATA_TYPE)minloc_reducer.get();
+  Index2D checkraja_loc   = checkminloc_reducer.getLoc();
   DATA_TYPE checkraja_min = (DATA_TYPE)checkminloc_reducer.get();
 
   ASSERT_DOUBLE_EQ((DATA_TYPE)checkraja_min, (DATA_TYPE)raja_min);
   ASSERT_EQ(checkraja_loc.idx, raja_loc.idx);
   ASSERT_EQ(checkraja_loc.idy, raja_loc.idy);
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE *> ( work_res,
-                                          workarr2D,
-                                          checkarr2D,
-                                          testarr2D
-                                        );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE*>(work_res, workarr2D, checkarr2D,
+                                       testarr2D);
 }
 
 
 TYPED_TEST_SUITE_P(KernelLocMin2DTest);
 template <typename T>
 class KernelLocMin2DTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelLocMin2DTest, LocMin2DKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
   using FORALL_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<4>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<5>>::type;
 
-  KernelLocMin2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(10, 10);
-  KernelLocMin2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(151, 151);
-  KernelLocMin2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(362, 362);
+  KernelLocMin2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                         EXEC_POLICY, REDUCE_POLICY>(10, 10);
+  KernelLocMin2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                         EXEC_POLICY, REDUCE_POLICY>(151, 151);
+  KernelLocMin2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                         EXEC_POLICY, REDUCE_POLICY>(362, 362);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelLocMin2DTest,
-                            LocMin2DKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelLocMin2DTest, LocMin2DKernel);
 
 #endif  // __TEST_KERNEL_REDUCELOC_MIN2D_HPP__
diff --git a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2DView.hpp b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2DView.hpp
index cf0791e8d5..046ec52a6b 100644
--- a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2DView.hpp
+++ b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2DView.hpp
@@ -8,56 +8,50 @@
 #ifndef __TEST_KERNEL_REDUCELOC_MIN2DVIEW_HPP__
 #define __TEST_KERNEL_REDUCELOC_MIN2DVIEW_HPP__
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename FORALL_POLICY, typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename FORALL_POLICY,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void KernelLocMin2DViewTestImpl(const int xdim, const int ydim)
 {
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE ** workarr2D;
-  DATA_TYPE ** checkarr2D;
-  DATA_TYPE ** testarr2D;
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE** workarr2D;
+  DATA_TYPE** checkarr2D;
+  DATA_TYPE** testarr2D;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // square 2D array, xdim x ydim
   INDEX_TYPE array_length = xdim * ydim;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
-  allocateForallTestData<DATA_TYPE *> ( ydim,
-                                        work_res,
-                                        &workarr2D,
-                                        &checkarr2D,
-                                        &testarr2D
-                                      );
+  allocateForallTestData<DATA_TYPE*>(ydim, work_res, &workarr2D, &checkarr2D,
+                                     &testarr2D);
 
   // set rows to point to check and work _arrays
-  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0,ydim);
+  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0, ydim);
   RAJA::forall<FORALL_POLICY>(seg, [=] RAJA_HOST_DEVICE(INDEX_TYPE zz)
-  {
-    workarr2D[zz] = work_array + zz * ydim;
-  });
+                              { workarr2D[zz] = work_array + zz * ydim; });
 
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    checkarr2D[zz] = check_array + zz * ydim;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg, [=](INDEX_TYPE zz)
+                               { checkarr2D[zz] = check_array + zz * ydim; });
 
   // initializing  values
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    for ( int xx = 0; xx < xdim; ++xx )
-    {
-      checkarr2D[zz][xx] = zz*xdim + xx + 1;
-    }
-    checkarr2D[ydim-1][xdim-1] = 0;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg,
+                               [=](INDEX_TYPE zz)
+                               {
+                                 for (int xx = 0; xx < xdim; ++xx)
+                                 {
+                                   checkarr2D[zz][xx] = zz * xdim + xx + 1;
+                                 }
+                                 checkarr2D[ydim - 1][xdim - 1] = 0;
+                               });
 
   work_res.memcpy(work_array, check_array, sizeof(DATA_TYPE) * array_length);
 
@@ -66,67 +60,66 @@ void KernelLocMin2DViewTestImpl(const int xdim, const int ydim)
 
   RAJA::View<DATA_TYPE, RAJA::Layout<2>> ArrView(work_array, xdim, ydim);
 
-  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, Index2D> minloc_reducer((DATA_TYPE)1024, Index2D(0, 0));
+  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, Index2D> minloc_reducer(
+      (DATA_TYPE)1024, Index2D(0, 0));
 
-  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
-                           [=] RAJA_HOST_DEVICE (int c, int r) {
-                             minloc_reducer.minloc(ArrView(r, c), Index2D(c, r));
-                           });
+  RAJA::kernel<EXEC_POLICY>(
+      RAJA::make_tuple(colrange, rowrange), [=] RAJA_HOST_DEVICE(int c, int r)
+      { minloc_reducer.minloc(ArrView(r, c), Index2D(c, r)); });
 
   // CPU answer
-  RAJA::ReduceMinLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkminloc_reducer((DATA_TYPE)1024, Index2D(0, 0));
-
-  RAJA::forall<RAJA::seq_exec>(colrange, [=] (INDEX_TYPE c) {
-    for( int r = 0; r < ydim; ++r)
-    {
-      checkminloc_reducer.minloc(checkarr2D[r][c], Index2D(c, r));
-    }
-  });
-
-  Index2D raja_loc = minloc_reducer.getLoc();
-  DATA_TYPE raja_min = (DATA_TYPE)minloc_reducer.get();
-  Index2D checkraja_loc = checkminloc_reducer.getLoc();
+  RAJA::ReduceMinLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkminloc_reducer(
+      (DATA_TYPE)1024, Index2D(0, 0));
+
+  RAJA::forall<RAJA::seq_exec>(colrange,
+                               [=](INDEX_TYPE c)
+                               {
+                                 for (int r = 0; r < ydim; ++r)
+                                 {
+                                   checkminloc_reducer.minloc(checkarr2D[r][c],
+                                                              Index2D(c, r));
+                                 }
+                               });
+
+  Index2D raja_loc        = minloc_reducer.getLoc();
+  DATA_TYPE raja_min      = (DATA_TYPE)minloc_reducer.get();
+  Index2D checkraja_loc   = checkminloc_reducer.getLoc();
   DATA_TYPE checkraja_min = (DATA_TYPE)checkminloc_reducer.get();
 
   ASSERT_DOUBLE_EQ((DATA_TYPE)checkraja_min, (DATA_TYPE)raja_min);
   ASSERT_EQ(checkraja_loc.idx, raja_loc.idx);
   ASSERT_EQ(checkraja_loc.idy, raja_loc.idy);
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE *> ( work_res,
-                                          workarr2D,
-                                          checkarr2D,
-                                          testarr2D
-                                        );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE*>(work_res, workarr2D, checkarr2D,
+                                       testarr2D);
 }
 
 
 TYPED_TEST_SUITE_P(KernelLocMin2DViewTest);
 template <typename T>
 class KernelLocMin2DViewTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelLocMin2DViewTest, LocMin2DViewKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
   using FORALL_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<4>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<5>>::type;
 
-  KernelLocMin2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(10, 10);
-  KernelLocMin2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(151, 151);
-  KernelLocMin2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(362, 362);
+  KernelLocMin2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                             EXEC_POLICY, REDUCE_POLICY>(10, 10);
+  KernelLocMin2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                             EXEC_POLICY, REDUCE_POLICY>(151, 151);
+  KernelLocMin2DViewTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY,
+                             EXEC_POLICY, REDUCE_POLICY>(362, 362);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelLocMin2DViewTest,
-                            LocMin2DViewKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelLocMin2DViewTest, LocMin2DViewKernel);
 
 #endif  // __TEST_KERNEL_REDUCELOC_MIN2DVIEW_HPP__
diff --git a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2DViewTuple.hpp b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2DViewTuple.hpp
index 4234471f89..57016c4e00 100644
--- a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2DViewTuple.hpp
+++ b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Min2DViewTuple.hpp
@@ -8,56 +8,50 @@
 #ifndef __TEST_KERNEL_REDUCELOC_MIN2DVIEWTUPLE_HPP__
 #define __TEST_KERNEL_REDUCELOC_MIN2DVIEWTUPLE_HPP__
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename FORALL_POLICY, typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename FORALL_POLICY,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void KernelLocMin2DViewTupleTestImpl(const int xdim, const int ydim)
 {
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE ** workarr2D;
-  DATA_TYPE ** checkarr2D;
-  DATA_TYPE ** testarr2D;
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE** workarr2D;
+  DATA_TYPE** checkarr2D;
+  DATA_TYPE** testarr2D;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // square 2D array, xdim x ydim
   INDEX_TYPE array_length = xdim * ydim;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
-  allocateForallTestData<DATA_TYPE *> ( ydim,
-                                        work_res,
-                                        &workarr2D,
-                                        &checkarr2D,
-                                        &testarr2D
-                                      );
+  allocateForallTestData<DATA_TYPE*>(ydim, work_res, &workarr2D, &checkarr2D,
+                                     &testarr2D);
 
   // set rows to point to check and work _arrays
-  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0,ydim);
+  RAJA::TypedRangeSegment<INDEX_TYPE> seg(0, ydim);
   RAJA::forall<FORALL_POLICY>(seg, [=] RAJA_HOST_DEVICE(INDEX_TYPE zz)
-  {
-    workarr2D[zz] = work_array + zz * ydim;
-  });
+                              { workarr2D[zz] = work_array + zz * ydim; });
 
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    checkarr2D[zz] = check_array + zz * ydim;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg, [=](INDEX_TYPE zz)
+                               { checkarr2D[zz] = check_array + zz * ydim; });
 
   // initializing  values
-  RAJA::forall<RAJA::seq_exec>(seg, [=] (INDEX_TYPE zz)
-  {
-    for ( int xx = 0; xx < xdim; ++xx )
-    {
-      checkarr2D[zz][xx] = zz*xdim + xx + 1;
-    }
-    checkarr2D[ydim-1][xdim-1] = 0;
-  });
+  RAJA::forall<RAJA::seq_exec>(seg,
+                               [=](INDEX_TYPE zz)
+                               {
+                                 for (int xx = 0; xx < xdim; ++xx)
+                                 {
+                                   checkarr2D[zz][xx] = zz * xdim + xx + 1;
+                                 }
+                                 checkarr2D[ydim - 1][xdim - 1] = 0;
+                               });
 
   work_res.memcpy(work_array, check_array, sizeof(DATA_TYPE) * array_length);
 
@@ -68,64 +62,72 @@ void KernelLocMin2DViewTupleTestImpl(const int xdim, const int ydim)
 
   RAJA::tuple<DATA_TYPE, DATA_TYPE> LocTup(0, 0);
 
-  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE, RAJA::tuple<DATA_TYPE, DATA_TYPE>> minloc_reducer((DATA_TYPE)1024, LocTup);
+  RAJA::ReduceMinLoc<REDUCE_POLICY, DATA_TYPE,
+                     RAJA::tuple<DATA_TYPE, DATA_TYPE>>
+      minloc_reducer((DATA_TYPE)1024, LocTup);
 
   RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
-                           [=] RAJA_HOST_DEVICE (int c, int r) {
-                             minloc_reducer.minloc(ArrView(r, c), RAJA::make_tuple((DATA_TYPE)c, (DATA_TYPE)r));
-                           });
+                            [=] RAJA_HOST_DEVICE(int c, int r)
+                            {
+                              minloc_reducer.minloc(
+                                  ArrView(r, c),
+                                  RAJA::make_tuple((DATA_TYPE)c, (DATA_TYPE)r));
+                            });
 
   // CPU answer
-  RAJA::ReduceMinLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkminloc_reducer((DATA_TYPE)1024, Index2D(0, 0));
-
-  RAJA::forall<RAJA::seq_exec>(colrange, [=] (INDEX_TYPE c) {
-    for( int r = 0; r < ydim; ++r)
-    {
-      checkminloc_reducer.minloc(checkarr2D[r][c], Index2D(c, r));
-    }
-  });
+  RAJA::ReduceMinLoc<RAJA::seq_reduce, DATA_TYPE, Index2D> checkminloc_reducer(
+      (DATA_TYPE)1024, Index2D(0, 0));
+
+  RAJA::forall<RAJA::seq_exec>(colrange,
+                               [=](INDEX_TYPE c)
+                               {
+                                 for (int r = 0; r < ydim; ++r)
+                                 {
+                                   checkminloc_reducer.minloc(checkarr2D[r][c],
+                                                              Index2D(c, r));
+                                 }
+                               });
 
   RAJA::tuple<DATA_TYPE, DATA_TYPE> raja_loc = minloc_reducer.getLoc();
-  DATA_TYPE raja_min = (DATA_TYPE)minloc_reducer.get();
-  Index2D checkraja_loc = checkminloc_reducer.getLoc();
+  DATA_TYPE raja_min                         = (DATA_TYPE)minloc_reducer.get();
+  Index2D checkraja_loc                      = checkminloc_reducer.getLoc();
   DATA_TYPE checkraja_min = (DATA_TYPE)checkminloc_reducer.get();
 
   ASSERT_DOUBLE_EQ((DATA_TYPE)checkraja_min, (DATA_TYPE)raja_min);
   ASSERT_EQ(checkraja_loc.idx, RAJA::get<0>(raja_loc));
   ASSERT_EQ(checkraja_loc.idy, RAJA::get<1>(raja_loc));
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE *> ( work_res,
-                                          workarr2D,
-                                          checkarr2D,
-                                          testarr2D
-                                        );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE*>(work_res, workarr2D, checkarr2D,
+                                       testarr2D);
 }
 
 
 TYPED_TEST_SUITE_P(KernelLocMin2DViewTupleTest);
 template <typename T>
 class KernelLocMin2DViewTupleTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelLocMin2DViewTupleTest, LocMin2DViewTupleKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
   using FORALL_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<4>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<5>>::type;
 
-  KernelLocMin2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(10, 10);
-  KernelLocMin2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(151, 151);
-  KernelLocMin2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(362, 362);
+  KernelLocMin2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(
+      10, 10);
+  KernelLocMin2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(
+      151, 151);
+  KernelLocMin2DViewTupleTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  FORALL_POLICY, EXEC_POLICY, REDUCE_POLICY>(
+      362, 362);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelLocMin2DViewTupleTest,
diff --git a/test/functional/kernel/region/tests/test-kernel-region-data.hpp b/test/functional/kernel/region/tests/test-kernel-region-data.hpp
index f0b9f58ff6..63c696f96a 100644
--- a/test/functional/kernel/region/tests/test-kernel-region-data.hpp
+++ b/test/functional/kernel/region/tests/test-kernel-region-data.hpp
@@ -11,7 +11,9 @@
 template <typename T>
 void allocRegionTestData(int N,
                          camp::resources::Resource work_res,
-                         T** work1, T** work2, T** work3,
+                         T** work1,
+                         T** work2,
+                         T** work3,
                          camp::resources::Resource host_res,
                          T** check)
 {
@@ -24,7 +26,9 @@ void allocRegionTestData(int N,
 
 template <typename T>
 void deallocRegionTestData(camp::resources::Resource work_res,
-                           T* work1, T* work2, T* work3,
+                           T* work1,
+                           T* work2,
+                           T* work3,
                            camp::resources::Resource host_res,
                            T* check)
 {
diff --git a/test/functional/kernel/region/tests/test-kernel-region-sync.hpp b/test/functional/kernel/region/tests/test-kernel-region-sync.hpp
index b9ad122d2b..cb87a63357 100644
--- a/test/functional/kernel/region/tests/test-kernel-region-sync.hpp
+++ b/test/functional/kernel/region/tests/test-kernel-region-sync.hpp
@@ -15,28 +15,25 @@
 template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void KernelRegionSyncTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 {
-  camp::resources::Resource host_res{camp::resources::Host()};
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource host_res {camp::resources::Host()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
   const INDEX_TYPE N = last - first;
-  
+
   INDEX_TYPE* work_array1;
   INDEX_TYPE* work_array2;
   INDEX_TYPE* work_array3;
 
   INDEX_TYPE* check_array;
 
-  allocRegionTestData(N,
-                      work_res,
-                      &work_array1, &work_array2, &work_array3,
-                      host_res,
-                      &check_array);
+  allocRegionTestData(N, work_res, &work_array1, &work_array2, &work_array3,
+                      host_res, &check_array);
 
-  work_res.memset( work_array1, 0, sizeof(INDEX_TYPE) * N );
-  work_res.memset( work_array2, 0, sizeof(INDEX_TYPE) * N );
-  work_res.memset( work_array3, 0, sizeof(INDEX_TYPE) * N );
+  work_res.memset(work_array1, 0, sizeof(INDEX_TYPE) * N);
+  work_res.memset(work_array2, 0, sizeof(INDEX_TYPE) * N);
+  work_res.memset(work_array3, 0, sizeof(INDEX_TYPE) * N);
 
-  host_res.memset( check_array, 0, sizeof(INDEX_TYPE) * N );
+  host_res.memset(check_array, 0, sizeof(INDEX_TYPE) * N);
 
   //
   // Create a list segment with indices in reverse order from range
@@ -48,48 +45,42 @@ void KernelRegionSyncTestImpl(INDEX_TYPE first, INDEX_TYPE last)
   std::vector<INDEX_TYPE> idx_array(N);
   std::iota(idx_array.begin(), idx_array.end(), first);
   std::reverse(idx_array.begin(), idx_array.end());
-  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], N,
-                                          work_res);
+  RAJA::TypedListSegment<INDEX_TYPE> lseg(&idx_array[0], N, work_res);
 
   RAJA::TypedRangeSegment<INDEX_TYPE> rseg(first, last);
 
   RAJA::kernel<EXEC_POLICY>(
 
-    RAJA::make_tuple(rseg, lseg),
+      RAJA::make_tuple(rseg, lseg),
 
-    [=] (INDEX_TYPE i) {
-      work_array1[i - first] = 50;
-    },
+      [=](INDEX_TYPE i) { work_array1[i - first] = 50; },
 
-    [=] (INDEX_TYPE i) {
-      work_array2[i - first] = 100;
-    },
+      [=](INDEX_TYPE i) { work_array2[i - first] = 100; },
 
-    [=] (INDEX_TYPE i) {
-      work_array3[i - first] = work_array1[i - first] + 
-                               work_array2[i - first] + 1;
-    }
+      [=](INDEX_TYPE i)
+      {
+        work_array3[i - first] =
+            work_array1[i - first] + work_array2[i - first] + 1;
+      }
 
   );
-  
+
   work_res.memcpy(check_array, work_array3, sizeof(INDEX_TYPE) * N);
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(check_array[i], 151);
   }
 
-  deallocRegionTestData(work_res,
-                        work_array1, work_array2, work_array3,
-                        host_res,
-                        check_array);
+  deallocRegionTestData(work_res, work_array1, work_array2, work_array3,
+                        host_res, check_array);
 }
 
 
 TYPED_TEST_SUITE_P(KernelRegionSyncTest);
 template <typename T>
 class KernelRegionSyncTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelRegionSyncTest, RegionSyncKernel)
 {
@@ -102,7 +93,6 @@ TYPED_TEST_P(KernelRegionSyncTest, RegionSyncKernel)
   KernelRegionSyncTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(3, 2556);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelRegionSyncTest,
-                            RegionSyncKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelRegionSyncTest, RegionSyncKernel);
 
 #endif  // __TEST_KERNEL_REGION_SYNC_HPP__
diff --git a/test/functional/kernel/region/tests/test-kernel-region.hpp b/test/functional/kernel/region/tests/test-kernel-region.hpp
index bb2ec449e0..e444da11ba 100644
--- a/test/functional/kernel/region/tests/test-kernel-region.hpp
+++ b/test/functional/kernel/region/tests/test-kernel-region.hpp
@@ -11,69 +11,61 @@
 template <typename INDEX_TYPE, typename WORKING_RES, typename EXEC_POLICY>
 void KernelRegionTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 {
-  camp::resources::Resource host_res{camp::resources::Host()};
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource host_res {camp::resources::Host()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
   const INDEX_TYPE N = last - first;
-  
+
   INDEX_TYPE* work_array1;
   INDEX_TYPE* work_array2;
   INDEX_TYPE* work_array3;
 
   INDEX_TYPE* check_array;
 
-  allocRegionTestData(N,
-                      work_res,
-                      &work_array1, &work_array2, &work_array3,
-                      host_res,
-                      &check_array);
+  allocRegionTestData(N, work_res, &work_array1, &work_array2, &work_array3,
+                      host_res, &check_array);
 
-  work_res.memset( work_array1, 0, sizeof(INDEX_TYPE) * N );
-  work_res.memset( work_array2, 0, sizeof(INDEX_TYPE) * N );
-  work_res.memset( work_array3, 0, sizeof(INDEX_TYPE) * N );
+  work_res.memset(work_array1, 0, sizeof(INDEX_TYPE) * N);
+  work_res.memset(work_array2, 0, sizeof(INDEX_TYPE) * N);
+  work_res.memset(work_array3, 0, sizeof(INDEX_TYPE) * N);
 
-  host_res.memset( check_array, 0, sizeof(INDEX_TYPE) * N );
+  host_res.memset(check_array, 0, sizeof(INDEX_TYPE) * N);
 
 
   RAJA::TypedRangeSegment<INDEX_TYPE> rseg(first, last);
 
   RAJA::kernel<EXEC_POLICY>(
 
-    RAJA::make_tuple(rseg),
+      RAJA::make_tuple(rseg),
 
-    [=] (INDEX_TYPE i) {
-      work_array1[i - first] = 50;
-    },
+      [=](INDEX_TYPE i) { work_array1[i - first] = 50; },
 
-    [=] (INDEX_TYPE i) {
-      work_array2[i - first] = 100;
-    },
+      [=](INDEX_TYPE i) { work_array2[i - first] = 100; },
 
-    [=] (INDEX_TYPE i) {
-      work_array3[i - first] = work_array1[i - first] + 
-                               work_array2[i - first] + 1;
-    }
+      [=](INDEX_TYPE i)
+      {
+        work_array3[i - first] =
+            work_array1[i - first] + work_array2[i - first] + 1;
+      }
 
   );
-  
-  work_res.memcpy(check_array, work_array3, sizeof(INDEX_TYPE) * N );
 
-  for (INDEX_TYPE i = 0; i < N; i++) {
+  work_res.memcpy(check_array, work_array3, sizeof(INDEX_TYPE) * N);
+
+  for (INDEX_TYPE i = 0; i < N; i++)
+  {
     ASSERT_EQ(check_array[i], 151);
   }
 
-  deallocRegionTestData(work_res,
-                        work_array1, work_array2, work_array3,
-                        host_res,
-                        check_array);
+  deallocRegionTestData(work_res, work_array1, work_array2, work_array3,
+                        host_res, check_array);
 }
 
 
 TYPED_TEST_SUITE_P(KernelRegionTest);
 template <typename T>
 class KernelRegionTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelRegionTest, RegionKernel)
 {
@@ -86,7 +78,6 @@ TYPED_TEST_P(KernelRegionTest, RegionKernel)
   KernelRegionTestImpl<INDEX_TYPE, WORKING_RES, EXEC_POLICY>(3, 2556);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelRegionTest,
-                            RegionKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelRegionTest, RegionKernel);
 
 #endif  // __TEST_KERNEL_REGION_HPP__
diff --git a/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-ForICount.hpp b/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-ForICount.hpp
index 82e749d226..48185fe281 100644
--- a/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-ForICount.hpp
+++ b/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-ForICount.hpp
@@ -11,8 +11,9 @@
 //
 // Value struct for manipulating tile sizes in parameterized tests.
 //
-template<int VALUE>
-struct Value {
+template <int VALUE>
+struct Value
+{
   static constexpr int value = VALUE;
 };
 
@@ -23,58 +24,57 @@ void KernelSingleLoopForICountTestImpl(IDX_TYPE N, IDX_TYPE tsize)
 
   RAJA::ReduceSum<REDUCE_POLICY, IDX_TYPE> trip_count(0);
 
-  for (IDX_TYPE t = 0; t < tsize; ++t) {
+  for (IDX_TYPE t = 0; t < tsize; ++t)
+  {
 
     RAJA::ReduceSum<REDUCE_POLICY, IDX_TYPE> tile_count(0);
 
     RAJA::kernel_param<EXEC_POLICY>(
-      RAJA::make_tuple( RAJA::TypedRangeSegment<IDX_TYPE>(0, N) ),
-      RAJA::make_tuple( static_cast<IDX_TYPE>(0) ),
-
-      [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE ii) {
-        trip_count += 1;
-        if ( i % tsize == t && ii == t ) { 
-          tile_count += 1;
-        }
-      }
-    );
+        RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>(0, N)),
+        RAJA::make_tuple(static_cast<IDX_TYPE>(0)),
+
+        [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE ii)
+        {
+          trip_count += 1;
+          if (i % tsize == t && ii == t)
+          {
+            tile_count += 1;
+          }
+        });
 
     IDX_TYPE trip_result = trip_count.get();
-    ASSERT_EQ( trip_result, (t+1) * N );
+    ASSERT_EQ(trip_result, (t + 1) * N);
 
     IDX_TYPE tile_result = tile_count.get();
 
     IDX_TYPE tile_expect = N / tsize;
-    if ( t < N % tsize ) {
+    if (t < N % tsize)
+    {
       tile_expect += 1;
     }
     ASSERT_EQ(tile_result, tile_expect);
-
   }
-
 }
 
 
 TYPED_TEST_SUITE_P(KernelSingleLoopForICountTest);
 template <typename T>
 class KernelSingleLoopForICountTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(KernelSingleLoopForICountTest, ForICountSingleLoopKernel)
 {
-  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<1>>::type;
+  using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<1>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
   IDX_TYPE tsize = camp::at_v<TypeParam, 3>::value;
 
   KernelSingleLoopForICountTestImpl<IDX_TYPE, EXEC_POLICY, REDUCE_POLICY>(
-    IDX_TYPE(57), tsize);
+      IDX_TYPE(57), tsize);
   KernelSingleLoopForICountTestImpl<IDX_TYPE, EXEC_POLICY, REDUCE_POLICY>(
-    IDX_TYPE(1035), tsize);
-
+      IDX_TYPE(1035), tsize);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelSingleLoopForICountTest,
diff --git a/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-TileTCount.hpp b/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-TileTCount.hpp
index e745a8d08b..078ee61cf6 100644
--- a/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-TileTCount.hpp
+++ b/test/functional/kernel/single-loop-tile-icount-tcount/tests/test-kernel-single-loop-TileTCount.hpp
@@ -11,8 +11,9 @@
 //
 // Value struct for manipulating tile sizes in parameterized tests.
 //
-template<int VALUE>
-struct Value {
+template <int VALUE>
+struct Value
+{
   static constexpr int value = VALUE;
 };
 
@@ -25,58 +26,57 @@ void KernelSingleLoopTileTCountTestImpl(IDX_TYPE N, IDX_TYPE tsize)
 
   RAJA::ReduceSum<REDUCE_POLICY, IDX_TYPE> trip_count(0);
 
-  for (IDX_TYPE t = 0; t < NT; ++t) {
+  for (IDX_TYPE t = 0; t < NT; ++t)
+  {
 
     RAJA::ReduceSum<REDUCE_POLICY, IDX_TYPE> tile_count(0);
 
     RAJA::kernel_param<EXEC_POLICY>(
-      RAJA::make_tuple( RAJA::TypedRangeSegment<IDX_TYPE>(0, N) ),
-      RAJA::make_tuple( static_cast<IDX_TYPE>(0) ),
-
-      [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE ti) {
-        trip_count += 1;
-        if ( i / tsize == t && ti == t ) {
-          tile_count += 1;
-        }
-      }
-    );
+        RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>(0, N)),
+        RAJA::make_tuple(static_cast<IDX_TYPE>(0)),
+
+        [=] RAJA_HOST_DEVICE(IDX_TYPE i, IDX_TYPE ti)
+        {
+          trip_count += 1;
+          if (i / tsize == t && ti == t)
+          {
+            tile_count += 1;
+          }
+        });
 
     IDX_TYPE trip_result = trip_count.get();
-    ASSERT_EQ( trip_result, (t+1) * N );
+    ASSERT_EQ(trip_result, (t + 1) * N);
 
     IDX_TYPE tile_result = tile_count.get();
 
     IDX_TYPE tile_expect = tsize;
-    if ( (t + 1) * tsize > N ) {
+    if ((t + 1) * tsize > N)
+    {
       tile_expect = N - t * tsize;
     }
     ASSERT_EQ(tile_result, tile_expect);
-
   }
-
 }
 
 
 TYPED_TEST_SUITE_P(KernelSingleLoopTileTCountTest);
 template <typename T>
 class KernelSingleLoopTileTCountTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(KernelSingleLoopTileTCountTest, TileTCountSingleLoopKernel)
 {
-  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<1>>::type;
+  using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<1>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<2>>::type;
 
   IDX_TYPE tsize = camp::at_v<TypeParam, 3>::value;
 
   KernelSingleLoopTileTCountTestImpl<IDX_TYPE, EXEC_POLICY, REDUCE_POLICY>(
-    IDX_TYPE(57), tsize);
+      IDX_TYPE(57), tsize);
   KernelSingleLoopTileTCountTestImpl<IDX_TYPE, EXEC_POLICY, REDUCE_POLICY>(
-    IDX_TYPE(1035), tsize);
-
+      IDX_TYPE(1035), tsize);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelSingleLoopTileTCountTest,
diff --git a/test/functional/kernel/tile-variants/tests/test-kernel-tile-Dynamic2D.hpp b/test/functional/kernel/tile-variants/tests/test-kernel-tile-Dynamic2D.hpp
index ccb57cfc62..5a28fbd523 100644
--- a/test/functional/kernel/tile-variants/tests/test-kernel-tile-Dynamic2D.hpp
+++ b/test/functional/kernel/tile-variants/tests/test-kernel-tile-Dynamic2D.hpp
@@ -10,136 +10,128 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename EXEC_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY>
 void KernelTileDynamic2DTestImpl(const int rows, const int cols)
 {
   // This test emulates matrix transposition with tiling.
 
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // holds transposed matrices
-  DATA_TYPE * work_array_t;
-  DATA_TYPE * check_array_t;
-  DATA_TYPE * test_array_t;
+  DATA_TYPE* work_array_t;
+  DATA_TYPE* check_array_t;
+  DATA_TYPE* test_array_t;
 
   INDEX_TYPE array_length = rows * cols;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
-
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array_t,
-                                      &check_array_t,
-                                      &test_array_t
-                                    );
-
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostView( test_array, rows, cols );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostTView( test_array_t, cols, rows );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkView( work_array, rows, cols );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkTView( work_array_t, cols, rows );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> CheckTView( check_array_t, cols, rows );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
+
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array_t,
+                                    &check_array_t, &test_array_t);
+
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostView(test_array, rows, cols);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostTView(test_array_t, cols, rows);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkView(work_array, rows, cols);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkTView(work_array_t, cols, rows);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> CheckTView(check_array_t, cols, rows);
 
   // initialize arrays
-  std::iota( test_array, test_array + array_length, 1 );
-  std::iota( test_array_t, test_array_t + array_length, 1 );
+  std::iota(test_array, test_array + array_length, 1);
+  std::iota(test_array_t, test_array_t + array_length, 1);
 
-  work_res.memcpy( work_array, test_array, sizeof(DATA_TYPE) * array_length );
-  work_res.memcpy( work_array_t, test_array_t, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(work_array, test_array, sizeof(DATA_TYPE) * array_length);
+  work_res.memcpy(work_array_t, test_array_t, sizeof(DATA_TYPE) * array_length);
 
   // transpose test_array on CPU
-  for ( int rr = 0; rr < rows; ++rr )
+  for (int rr = 0; rr < rows; ++rr)
   {
-    for ( int cc = 0; cc < cols; ++cc )
+    for (int cc = 0; cc < cols; ++cc)
     {
-      HostTView( cc, rr ) = HostView( rr, cc ); 
+      HostTView(cc, rr) = HostView(rr, cc);
     }
   }
 
   // transpose work_array
-  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange( 0, rows );
-  RAJA::TypedRangeSegment<INDEX_TYPE> colrange( 0, cols );
+  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange(0, rows);
+  RAJA::TypedRangeSegment<INDEX_TYPE> colrange(0, cols);
 
-  RAJA::kernel_param<EXEC_POLICY> (
-    RAJA::make_tuple( colrange, rowrange ),
-    RAJA::make_tuple( RAJA::TileSize{tile_dim_x}, RAJA::TileSize{tile_dim_y} ),
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE cc, INDEX_TYPE rr ) {
-      WorkTView( cc, rr ) = WorkView( rr, cc );
-  });
+  RAJA::kernel_param<EXEC_POLICY>(
+      RAJA::make_tuple(colrange, rowrange),
+      RAJA::make_tuple(RAJA::TileSize {tile_dim_x},
+                       RAJA::TileSize {tile_dim_y}),
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE cc, INDEX_TYPE rr)
+      { WorkTView(cc, rr) = WorkView(rr, cc); });
 
-  work_res.memcpy( check_array_t, work_array_t, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(check_array_t, work_array_t,
+                  sizeof(DATA_TYPE) * array_length);
 
-  for ( int rr = 0; rr < rows; ++rr )
+  for (int rr = 0; rr < rows; ++rr)
   {
-    for ( int cc = 0; cc < cols; ++cc )
+    for (int cc = 0; cc < cols; ++cc)
     {
       ASSERT_EQ(CheckTView(cc, rr), HostTView(cc, rr));
     }
   }
 
   // reset check and work transpose arrays
-  work_res.memcpy( check_array_t, test_array, sizeof(DATA_TYPE) * array_length );
-  work_res.memcpy( work_array_t, test_array, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(check_array_t, test_array, sizeof(DATA_TYPE) * array_length);
+  work_res.memcpy(work_array_t, test_array, sizeof(DATA_TYPE) * array_length);
 
   // transpose work_array again with different tile sizes
-  RAJA::kernel_param<EXEC_POLICY> (
-    RAJA::make_tuple( colrange, rowrange ),
-    RAJA::make_tuple( RAJA::TileSize{tile_dim_x}, RAJA::TileSize{tile_dim_y/2} ),
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE cc, INDEX_TYPE rr ) {
-      WorkTView( cc, rr ) = WorkView( rr, cc );
-  });
+  RAJA::kernel_param<EXEC_POLICY>(
+      RAJA::make_tuple(colrange, rowrange),
+      RAJA::make_tuple(RAJA::TileSize {tile_dim_x},
+                       RAJA::TileSize {tile_dim_y / 2}),
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE cc, INDEX_TYPE rr)
+      { WorkTView(cc, rr) = WorkView(rr, cc); });
 
-  work_res.memcpy( check_array_t, work_array_t, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(check_array_t, work_array_t,
+                  sizeof(DATA_TYPE) * array_length);
 
-  for ( int rr = 0; rr < rows; ++rr )
+  for (int rr = 0; rr < rows; ++rr)
   {
-    for ( int cc = 0; cc < cols; ++cc )
+    for (int cc = 0; cc < cols; ++cc)
     {
       ASSERT_EQ(CheckTView(cc, rr), HostTView(cc, rr));
     }
   }
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array_t,
-                                        check_array_t,
-                                        test_array_t
-                                      );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array_t, check_array_t,
+                                      test_array_t);
 }
 
 
 TYPED_TEST_SUITE_P(KernelTileDynamic2DTest);
 template <typename T>
 class KernelTileDynamic2DTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelTileDynamic2DTest, TileDynamic2DKernel)
 {
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  KernelTileDynamic2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(10, 10);
-  KernelTileDynamic2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(151, 111);
-  KernelTileDynamic2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(362, 362);
+  KernelTileDynamic2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(
+      10, 10);
+  KernelTileDynamic2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(
+      151, 111);
+  KernelTileDynamic2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(
+      362, 362);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelTileDynamic2DTest,
-                            TileDynamic2DKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelTileDynamic2DTest, TileDynamic2DKernel);
 
 #endif  // __TEST_KERNEL_TILE_DYNAMIC2D_HPP__
diff --git a/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2D.hpp b/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2D.hpp
index 9013e5c9ea..0bfe064bbd 100644
--- a/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2D.hpp
+++ b/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2D.hpp
@@ -10,112 +10,102 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename EXEC_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY>
 void KernelTileFixed2DTestImpl(const int rows, const int cols)
 {
   // This test emulates matrix transposition with tiling.
 
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // holds transposed matrices
-  DATA_TYPE * work_array_t;
-  DATA_TYPE * check_array_t;
-  DATA_TYPE * test_array_t;
+  DATA_TYPE* work_array_t;
+  DATA_TYPE* check_array_t;
+  DATA_TYPE* test_array_t;
 
   INDEX_TYPE array_length = rows * cols;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
-
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array_t,
-                                      &check_array_t,
-                                      &test_array_t
-                                    );
-
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostView( test_array, rows, cols );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostTView( test_array_t, cols, rows );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkView( work_array, rows, cols );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkTView( work_array_t, cols, rows );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> CheckTView( check_array_t, cols, rows );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
+
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array_t,
+                                    &check_array_t, &test_array_t);
+
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostView(test_array, rows, cols);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostTView(test_array_t, cols, rows);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkView(work_array, rows, cols);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkTView(work_array_t, cols, rows);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> CheckTView(check_array_t, cols, rows);
 
   // initialize arrays
-  std::iota( test_array, test_array + array_length, 1 );
-  std::iota( test_array_t, test_array_t + array_length, 1 );
+  std::iota(test_array, test_array + array_length, 1);
+  std::iota(test_array_t, test_array_t + array_length, 1);
 
-  work_res.memcpy( work_array, test_array, sizeof(DATA_TYPE) * array_length );
-  work_res.memcpy( work_array_t, test_array_t, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(work_array, test_array, sizeof(DATA_TYPE) * array_length);
+  work_res.memcpy(work_array_t, test_array_t, sizeof(DATA_TYPE) * array_length);
 
   // transpose test_array on CPU
-  for ( int rr = 0; rr < rows; ++rr )
+  for (int rr = 0; rr < rows; ++rr)
   {
-    for ( int cc = 0; cc < cols; ++cc )
+    for (int cc = 0; cc < cols; ++cc)
     {
-      HostTView( cc, rr ) = HostView( rr, cc ); 
+      HostTView(cc, rr) = HostView(rr, cc);
     }
   }
 
   // transpose work_array
-  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange( 0, rows );
-  RAJA::TypedRangeSegment<INDEX_TYPE> colrange( 0, cols );
+  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange(0, rows);
+  RAJA::TypedRangeSegment<INDEX_TYPE> colrange(0, cols);
 
-  RAJA::kernel<EXEC_POLICY> ( RAJA::make_tuple( colrange, rowrange ),
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE cc, INDEX_TYPE rr ) {
-      WorkTView( cc, rr ) = WorkView( rr, cc );
-  });
+  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
+                            [=] RAJA_HOST_DEVICE(INDEX_TYPE cc, INDEX_TYPE rr)
+                            { WorkTView(cc, rr) = WorkView(rr, cc); });
 
-  work_res.memcpy( check_array_t, work_array_t, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(check_array_t, work_array_t,
+                  sizeof(DATA_TYPE) * array_length);
 
-  for ( int rr = 0; rr < rows; ++rr )
+  for (int rr = 0; rr < rows; ++rr)
   {
-    for ( int cc = 0; cc < cols; ++cc )
+    for (int cc = 0; cc < cols; ++cc)
     {
       ASSERT_EQ(CheckTView(cc, rr), HostTView(cc, rr));
     }
   }
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array_t,
-                                        check_array_t,
-                                        test_array_t
-                                      );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array_t, check_array_t,
+                                      test_array_t);
 }
 
 
 TYPED_TEST_SUITE_P(KernelTileFixed2DTest);
 template <typename T>
 class KernelTileFixed2DTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelTileFixed2DTest, TileFixed2DKernel)
 {
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  KernelTileFixed2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(10, 10);
-  KernelTileFixed2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(151, 111);
-  KernelTileFixed2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(362, 362);
+  KernelTileFixed2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(
+      10, 10);
+  KernelTileFixed2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(
+      151, 111);
+  KernelTileFixed2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(
+      362, 362);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelTileFixed2DTest,
-                            TileFixed2DKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelTileFixed2DTest, TileFixed2DKernel);
 
 #endif  // __TEST_KERNEL_TILE_FIXED2D_HPP__
diff --git a/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2DMinMax.hpp b/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2DMinMax.hpp
index ac876065a1..1dcda30f9e 100644
--- a/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2DMinMax.hpp
+++ b/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2DMinMax.hpp
@@ -11,42 +11,42 @@
 #include <numeric>
 #include <vector>
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void KernelTileFixed2DMinMaxTestImpl(const int rows, const int cols)
 {
   // This test reduces min and max with tiling.
 
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   INDEX_TYPE array_length = rows * cols;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
 
   // initialize arrays
-  std::iota( test_array, test_array + array_length, 1 );
+  std::iota(test_array, test_array + array_length, 1);
 
   // set min and max of the array
   test_array[4] = -1;
-  test_array[8] = array_length+2;
+  test_array[8] = array_length + 2;
 
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkView( work_array, rows, cols );
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkView(work_array, rows, cols);
 
-  work_res.memcpy( work_array, test_array, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(work_array, test_array, sizeof(DATA_TYPE) * array_length);
 
-  RAJA::ReduceMin<REDUCE_POLICY, DATA_TYPE> workmin( DATA_TYPE(99999) ); 
-  RAJA::ReduceMax<REDUCE_POLICY, DATA_TYPE> workmax( DATA_TYPE(-1) ); 
+  RAJA::ReduceMin<REDUCE_POLICY, DATA_TYPE> workmin(DATA_TYPE(99999));
+  RAJA::ReduceMax<REDUCE_POLICY, DATA_TYPE> workmax(DATA_TYPE(-1));
 
   // mixed range types
-  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange( 0, rows );
+  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange(0, rows);
 
   std::vector<INDEX_TYPE> colidx;
   for (INDEX_TYPE ii = INDEX_TYPE(0); ii < static_cast<INDEX_TYPE>(cols); ++ii)
@@ -54,43 +54,45 @@ void KernelTileFixed2DMinMaxTestImpl(const int rows, const int cols)
     colidx.push_back(ii);
   }
 
-  RAJA::TypedListSegment<INDEX_TYPE> colrange( &colidx[0], colidx.size(), work_res );
+  RAJA::TypedListSegment<INDEX_TYPE> colrange(&colidx[0], colidx.size(),
+                                              work_res);
 
   // find min and max on target platform
-  RAJA::kernel<EXEC_POLICY> ( RAJA::make_tuple( colrange, rowrange ),
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE cc, INDEX_TYPE rr ) {
-      workmin.min(WorkView(rr, cc));
-      workmax.max(WorkView(rr, cc));
-  });
+  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
+                            [=] RAJA_HOST_DEVICE(INDEX_TYPE cc, INDEX_TYPE rr)
+                            {
+                              workmin.min(WorkView(rr, cc));
+                              workmax.max(WorkView(rr, cc));
+                            });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(-1), static_cast<DATA_TYPE>(workmin.get()));
-  ASSERT_EQ(static_cast<DATA_TYPE>(array_length+2), static_cast<DATA_TYPE>(workmax.get()));
+  ASSERT_EQ(static_cast<DATA_TYPE>(array_length + 2),
+            static_cast<DATA_TYPE>(workmax.get()));
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
 }
 
 
 TYPED_TEST_SUITE_P(KernelTileFixed2DMinMaxTest);
 template <typename T>
 class KernelTileFixed2DMinMaxTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelTileFixed2DMinMaxTest, TileFixed2DMinMaxKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  KernelTileFixed2DMinMaxTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(10, 10);
-  KernelTileFixed2DMinMaxTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(151, 111);
-  KernelTileFixed2DMinMaxTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(362, 362);
+  KernelTileFixed2DMinMaxTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  EXEC_POLICY, REDUCE_POLICY>(10, 10);
+  KernelTileFixed2DMinMaxTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  EXEC_POLICY, REDUCE_POLICY>(151, 111);
+  KernelTileFixed2DMinMaxTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                  EXEC_POLICY, REDUCE_POLICY>(362, 362);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelTileFixed2DMinMaxTest,
diff --git a/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2DSum.hpp b/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2DSum.hpp
index 33da6d3c7d..6304b1500f 100644
--- a/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2DSum.hpp
+++ b/test/functional/kernel/tile-variants/tests/test-kernel-tile-Fixed2DSum.hpp
@@ -12,13 +12,17 @@
 #include <vector>
 #include <type_traits>
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POLICY>
 void KernelTileFixed2DSumTestImpl(const int rowsin, const int colsin)
 {
   // This test reduces sums with tiling.
 
   int rows, cols;
-  if ( std::is_same<DATA_TYPE, float>::value )
+  if (std::is_same<DATA_TYPE, float>::value)
   {
     // Restrict to a small data size for better float precision.
     rows = 3;
@@ -30,20 +34,20 @@ void KernelTileFixed2DSumTestImpl(const int rowsin, const int colsin)
     cols = colsin;
   }
 
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
   DATA_TYPE hostsum = 0;
 
-  RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> worksum( DATA_TYPE(0) ); 
+  RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> worksum(DATA_TYPE(0));
 
   // sum on CPU in a tiled manner
-  for ( int rr = 0; rr < rows; rr += tile_dim_x )
+  for (int rr = 0; rr < rows; rr += tile_dim_x)
   {
-    for ( int cc = 0; cc < cols; cc += tile_dim_y )
+    for (int cc = 0; cc < cols; cc += tile_dim_y)
     {
-      for ( int r = rr; r < std::min(rr+tile_dim_x, rows); ++r )
+      for (int r = rr; r < std::min(rr + tile_dim_x, rows); ++r)
       {
-        for ( int c = cc; c < std::min(cc+tile_dim_y, cols); ++c )
+        for (int c = cc; c < std::min(cc + tile_dim_y, cols); ++c)
         {
           hostsum += (DATA_TYPE)(r * 1.1 + c);
         }
@@ -52,7 +56,7 @@ void KernelTileFixed2DSumTestImpl(const int rowsin, const int colsin)
   }
 
   // mixed range types
-  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange( 0, rows );
+  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange(0, rows);
 
   std::vector<INDEX_TYPE> colidx;
   for (INDEX_TYPE ii = INDEX_TYPE(0); ii < static_cast<INDEX_TYPE>(cols); ++ii)
@@ -60,13 +64,13 @@ void KernelTileFixed2DSumTestImpl(const int rowsin, const int colsin)
     colidx.push_back(ii);
   }
 
-  RAJA::TypedListSegment<INDEX_TYPE> colrange( &colidx[0], colidx.size(), work_res );
+  RAJA::TypedListSegment<INDEX_TYPE> colrange(&colidx[0], colidx.size(),
+                                              work_res);
 
   // sum on target platform
-  RAJA::kernel<EXEC_POLICY> ( RAJA::make_tuple( colrange, rowrange ),
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE cc, INDEX_TYPE rr ) {
-      worksum += (DATA_TYPE)(rr * 1.1 + cc);
-  });
+  RAJA::kernel<EXEC_POLICY>(RAJA::make_tuple(colrange, rowrange),
+                            [=] RAJA_HOST_DEVICE(INDEX_TYPE cc, INDEX_TYPE rr)
+                            { worksum += (DATA_TYPE)(rr * 1.1 + cc); });
 
   ASSERT_FLOAT_EQ(hostsum, (DATA_TYPE)worksum.get());
 }
@@ -75,23 +79,24 @@ void KernelTileFixed2DSumTestImpl(const int rowsin, const int colsin)
 TYPED_TEST_SUITE_P(KernelTileFixed2DSumTest);
 template <typename T>
 class KernelTileFixed2DSumTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelTileFixed2DSumTest, TileFixed2DSumKernel)
 {
-  using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
-  using EXEC_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
+  using INDEX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
+  using EXEC_POLICY   = typename camp::at<TypeParam, camp::num<3>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  KernelTileFixed2DSumTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(10, 10);
-  KernelTileFixed2DSumTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(151, 111);
-  KernelTileFixed2DSumTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY, REDUCE_POLICY>(362, 362);
+  KernelTileFixed2DSumTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                               REDUCE_POLICY>(10, 10);
+  KernelTileFixed2DSumTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                               REDUCE_POLICY>(151, 111);
+  KernelTileFixed2DSumTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY,
+                               REDUCE_POLICY>(362, 362);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelTileFixed2DSumTest,
-                            TileFixed2DSumKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelTileFixed2DSumTest, TileFixed2DSumKernel);
 
 #endif  // __TEST_KERNEL_TILE_FIXED2DSUM_HPP__
diff --git a/test/functional/kernel/tile-variants/tests/test-kernel-tile-LocalArray2D.hpp b/test/functional/kernel/tile-variants/tests/test-kernel-tile-LocalArray2D.hpp
index 017512c50c..2bfe44934e 100644
--- a/test/functional/kernel/tile-variants/tests/test-kernel-tile-LocalArray2D.hpp
+++ b/test/functional/kernel/tile-variants/tests/test-kernel-tile-LocalArray2D.hpp
@@ -10,121 +10,114 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename DATA_TYPE, typename WORKING_RES, typename EXEC_POLICY>
+template <typename INDEX_TYPE,
+          typename DATA_TYPE,
+          typename WORKING_RES,
+          typename EXEC_POLICY>
 void KernelTileLocalArray2DTestImpl(const int rows, const int cols)
 {
   // This test emulates matrix transposition with tiling.
 
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
 
-  DATA_TYPE * work_array;
-  DATA_TYPE * check_array;
-  DATA_TYPE * test_array;
+  DATA_TYPE* work_array;
+  DATA_TYPE* check_array;
+  DATA_TYPE* test_array;
 
   // holds transposed matrices
-  DATA_TYPE * work_array_t;
-  DATA_TYPE * check_array_t;
-  DATA_TYPE * test_array_t;
+  DATA_TYPE* work_array_t;
+  DATA_TYPE* check_array_t;
+  DATA_TYPE* test_array_t;
 
   INDEX_TYPE array_length = rows * cols;
 
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array,
-                                      &check_array,
-                                      &test_array
-                                    );
-
-  allocateForallTestData<DATA_TYPE> ( array_length,
-                                      work_res,
-                                      &work_array_t,
-                                      &check_array_t,
-                                      &test_array_t
-                                    );
-
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostView( test_array, rows, cols );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostTView( test_array_t, cols, rows );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkView( work_array, rows, cols );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkTView( work_array_t, cols, rows );
-  RAJA::View<DATA_TYPE, RAJA::Layout<2>> CheckTView( check_array_t, cols, rows );
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array,
+                                    &check_array, &test_array);
+
+  allocateForallTestData<DATA_TYPE>(array_length, work_res, &work_array_t,
+                                    &check_array_t, &test_array_t);
+
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostView(test_array, rows, cols);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> HostTView(test_array_t, cols, rows);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkView(work_array, rows, cols);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> WorkTView(work_array_t, cols, rows);
+  RAJA::View<DATA_TYPE, RAJA::Layout<2>> CheckTView(check_array_t, cols, rows);
 
   // initialize local array (shared mem)
-  using TILE_MEM = RAJA::LocalArray<DATA_TYPE, RAJA::Perm<0,1>, RAJA::SizeList<tile_dim_x, tile_dim_y>>;
+  using TILE_MEM = RAJA::LocalArray<DATA_TYPE, RAJA::Perm<0, 1>,
+                                    RAJA::SizeList<tile_dim_x, tile_dim_y>>;
   TILE_MEM Tile_Array;
 
   // initialize arrays
-  std::iota( test_array, test_array + array_length, 1 );
-  std::iota( test_array_t, test_array_t + array_length, 1 );
+  std::iota(test_array, test_array + array_length, 1);
+  std::iota(test_array_t, test_array_t + array_length, 1);
 
-  work_res.memcpy( work_array, test_array, sizeof(DATA_TYPE) * array_length );
-  work_res.memcpy( work_array_t, test_array_t, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(work_array, test_array, sizeof(DATA_TYPE) * array_length);
+  work_res.memcpy(work_array_t, test_array_t, sizeof(DATA_TYPE) * array_length);
 
   // transpose test_array on CPU
-  for ( int rr = 0; rr < rows; ++rr )
+  for (int rr = 0; rr < rows; ++rr)
   {
-    for ( int cc = 0; cc < cols; ++cc )
+    for (int cc = 0; cc < cols; ++cc)
     {
-      HostTView( cc, rr ) = HostView( rr, cc ); 
+      HostTView(cc, rr) = HostView(rr, cc);
     }
   }
 
   // transpose work_array
-  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange( 0, rows );
-  RAJA::TypedRangeSegment<INDEX_TYPE> colrange( 0, cols );
+  RAJA::TypedRangeSegment<INDEX_TYPE> rowrange(0, rows);
+  RAJA::TypedRangeSegment<INDEX_TYPE> colrange(0, cols);
 
-  RAJA::kernel_param<EXEC_POLICY> ( RAJA::make_tuple( colrange, rowrange ), RAJA::make_tuple( (INDEX_TYPE)0, (INDEX_TYPE)0, Tile_Array ),
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE cc, INDEX_TYPE rr, INDEX_TYPE tx, INDEX_TYPE ty, TILE_MEM &Tile_Array ) {
-      Tile_Array( ty, tx ) = WorkView( rr, cc );
-    },
+  RAJA::kernel_param<EXEC_POLICY>(
+      RAJA::make_tuple(colrange, rowrange),
+      RAJA::make_tuple((INDEX_TYPE)0, (INDEX_TYPE)0, Tile_Array),
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE cc, INDEX_TYPE rr, INDEX_TYPE tx,
+                           INDEX_TYPE ty, TILE_MEM & Tile_Array)
+      { Tile_Array(ty, tx) = WorkView(rr, cc); },
 
-    [=] RAJA_HOST_DEVICE ( INDEX_TYPE cc, INDEX_TYPE rr, INDEX_TYPE tx, INDEX_TYPE ty, TILE_MEM &Tile_Array ) {
-      WorkTView( cc, rr ) = Tile_Array( ty, tx );
-    }
-  );
+      [=] RAJA_HOST_DEVICE(INDEX_TYPE cc, INDEX_TYPE rr, INDEX_TYPE tx,
+                           INDEX_TYPE ty, TILE_MEM & Tile_Array)
+      { WorkTView(cc, rr) = Tile_Array(ty, tx); });
 
-  work_res.memcpy( check_array_t, work_array_t, sizeof(DATA_TYPE) * array_length );
+  work_res.memcpy(check_array_t, work_array_t,
+                  sizeof(DATA_TYPE) * array_length);
 
-  for ( int rr = 0; rr < rows; ++rr )
+  for (int rr = 0; rr < rows; ++rr)
   {
-    for ( int cc = 0; cc < cols; ++cc )
+    for (int cc = 0; cc < cols; ++cc)
     {
       ASSERT_EQ(CheckTView(cc, rr), HostTView(cc, rr));
     }
   }
 
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array,
-                                        check_array,
-                                        test_array
-                                      );
-
-  deallocateForallTestData<DATA_TYPE> ( work_res,
-                                        work_array_t,
-                                        check_array_t,
-                                        test_array_t
-                                      );
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array, check_array,
+                                      test_array);
+
+  deallocateForallTestData<DATA_TYPE>(work_res, work_array_t, check_array_t,
+                                      test_array_t);
 }
 
 
 TYPED_TEST_SUITE_P(KernelTileLocalArray2DTest);
 template <typename T>
 class KernelTileLocalArray2DTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(KernelTileLocalArray2DTest, TileLocalArray2DKernel)
 {
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE  = typename camp::at<TypeParam, camp::num<1>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
   using EXEC_POLICY = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  KernelTileLocalArray2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(10, 10);
-  KernelTileLocalArray2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(151, 111);
-  KernelTileLocalArray2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES, EXEC_POLICY>(362, 362);
+  KernelTileLocalArray2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                 EXEC_POLICY>(10, 10);
+  KernelTileLocalArray2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                 EXEC_POLICY>(151, 111);
+  KernelTileLocalArray2DTestImpl<INDEX_TYPE, DATA_TYPE, WORKING_RES,
+                                 EXEC_POLICY>(362, 362);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(KernelTileLocalArray2DTest,
-                            TileLocalArray2DKernel);
+REGISTER_TYPED_TEST_SUITE_P(KernelTileLocalArray2DTest, TileLocalArray2DKernel);
 
 #endif  // __TEST_KERNEL_TILE_LOCALARRAY2D_HPP__
diff --git a/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-ReduceMask.hpp b/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-ReduceMask.hpp
index 13f9c62a45..c1d3daad25 100644
--- a/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-ReduceMask.hpp
+++ b/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-ReduceMask.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelWarpThreadReduceMaskTest);
 template <typename T>
-class KernelWarpThreadReduceMaskTest : public ::testing::Test {};
+class KernelWarpThreadReduceMaskTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelWarpThreadReduceMaskTest, WarpThreadReduceMaskKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelWarpThreadReduceMaskTest, WarpThreadReduceMaskKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,12 +33,14 @@ TYPED_TEST_P(KernelWarpThreadReduceMaskTest, WarpThreadReduceMaskKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = true;
 
   // For double nested loop tests the third arg is ignored.
-  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 64, 4*123 );
+  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 64, 4 * 123);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelWarpThreadReduceMaskTest,
diff --git a/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-ReduceWarp.hpp b/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-ReduceWarp.hpp
index cda8aaba59..bf1dda57af 100644
--- a/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-ReduceWarp.hpp
+++ b/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-ReduceWarp.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelWarpThreadReduceWarpTest);
 template <typename T>
-class KernelWarpThreadReduceWarpTest : public ::testing::Test {};
+class KernelWarpThreadReduceWarpTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelWarpThreadReduceWarpTest, WarpThreadReduceWarpKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelWarpThreadReduceWarpTest, WarpThreadReduceWarpKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,13 +33,15 @@ TYPED_TEST_P(KernelWarpThreadReduceWarpTest, WarpThreadReduceWarpKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = true;
 
   // For double nested loop tests the third arg is ignored.
   // Integer argument needs to be divisible by 10, and 16.
-  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 4000 );
+  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 4000);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelWarpThreadReduceWarpTest,
diff --git a/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-WarpLoop.hpp b/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-WarpLoop.hpp
index f3194fba44..7181c5f5a2 100644
--- a/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-WarpLoop.hpp
+++ b/test/functional/kernel/warp-thread/tests/test-kernel-resource-warp-thread-WarpLoop.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelWarpThreadWarpLoopTest);
 template <typename T>
-class KernelWarpThreadWarpLoopTest : public ::testing::Test {};
+class KernelWarpThreadWarpLoopTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelWarpThreadWarpLoopTest, WarpThreadWarpLoopKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelWarpThreadWarpLoopTest, WarpThreadWarpLoopKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,12 +33,14 @@ TYPED_TEST_P(KernelWarpThreadWarpLoopTest, WarpThreadWarpLoopKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = true;
 
   // For double nested loop tests the third arg is ignored.
-  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 2345 );
+  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 2345);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelWarpThreadWarpLoopTest,
diff --git a/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-ReduceMask.hpp b/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-ReduceMask.hpp
index 08e9a0c381..24c7f294ca 100644
--- a/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-ReduceMask.hpp
+++ b/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-ReduceMask.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelWarpThreadReduceMaskTest);
 template <typename T>
-class KernelWarpThreadReduceMaskTest : public ::testing::Test {};
+class KernelWarpThreadReduceMaskTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelWarpThreadReduceMaskTest, WarpThreadReduceMaskKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelWarpThreadReduceMaskTest, WarpThreadReduceMaskKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,12 +33,14 @@ TYPED_TEST_P(KernelWarpThreadReduceMaskTest, WarpThreadReduceMaskKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = false;
 
   // For double nested loop tests the third arg is ignored.
-  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 64, 4*123 );
+  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 64, 4 * 123);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelWarpThreadReduceMaskTest,
diff --git a/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-ReduceWarp.hpp b/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-ReduceWarp.hpp
index e61c05446c..6690efd2f9 100644
--- a/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-ReduceWarp.hpp
+++ b/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-ReduceWarp.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelWarpThreadReduceWarpTest);
 template <typename T>
-class KernelWarpThreadReduceWarpTest : public ::testing::Test {};
+class KernelWarpThreadReduceWarpTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelWarpThreadReduceWarpTest, WarpThreadReduceWarpKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelWarpThreadReduceWarpTest, WarpThreadReduceWarpKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,13 +33,15 @@ TYPED_TEST_P(KernelWarpThreadReduceWarpTest, WarpThreadReduceWarpKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = false;
 
   // For double nested loop tests the third arg is ignored.
   // Integer argument needs to be divisible by 10, and 16.
-  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 4000 );
+  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 4000);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelWarpThreadReduceWarpTest,
diff --git a/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-WarpLoop.hpp b/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-WarpLoop.hpp
index c435c484b2..ba8f38e64c 100644
--- a/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-WarpLoop.hpp
+++ b/test/functional/kernel/warp-thread/tests/test-kernel-warp-thread-WarpLoop.hpp
@@ -17,11 +17,13 @@
 //
 TYPED_TEST_SUITE_P(KernelWarpThreadWarpLoopTest);
 template <typename T>
-class KernelWarpThreadWarpLoopTest : public ::testing::Test {};
+class KernelWarpThreadWarpLoopTest : public ::testing::Test
+{};
 
-TYPED_TEST_P(KernelWarpThreadWarpLoopTest, WarpThreadWarpLoopKernel) {
-  using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using REDUCE_POL = typename camp::at<TypeParam, camp::num<1>>::type;
+TYPED_TEST_P(KernelWarpThreadWarpLoopTest, WarpThreadWarpLoopKernel)
+{
+  using WORKING_RES   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using REDUCE_POL    = typename camp::at<TypeParam, camp::num<1>>::type;
   using EXEC_POL_DATA = typename camp::at<TypeParam, camp::num<2>>::type;
 
   // Attain the loop depth type from execpol data.
@@ -31,12 +33,14 @@ TYPED_TEST_P(KernelWarpThreadWarpLoopTest, WarpThreadWarpLoopKernel) {
   using LOOP_POLS = typename EXEC_POL_DATA::type;
 
   // Build proper basic kernel exec policy type.
-  using EXEC_POLICY = typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
+  using EXEC_POLICY =
+      typename WarpThreadExec<LOOP_TYPE, REDUCE_POL, LOOP_POLS>::type;
 
   constexpr bool USE_RES = false;
 
   // For double nested loop tests the third arg is ignored.
-  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>( LOOP_TYPE(), 2345);
+  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RES>(
+      LOOP_TYPE(), 2345);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(KernelWarpThreadWarpLoopTest,
diff --git a/test/functional/kernel/warp-thread/tests/warp-thread-ReduceMask-impl.hpp b/test/functional/kernel/warp-thread/tests/warp-thread-ReduceMask-impl.hpp
index 797379e890..8809630544 100644
--- a/test/functional/kernel/warp-thread/tests/warp-thread-ReduceMask-impl.hpp
+++ b/test/functional/kernel/warp-thread/tests/warp-thread-ReduceMask-impl.hpp
@@ -10,38 +10,53 @@
 
 #include <numeric>
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_resource<EXEC_POL>( segs, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args)
+{
+  RAJA::kernel_resource<EXEC_POL>(segs, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args) {
-  RAJA::kernel<EXEC_POL>( segs, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel<EXEC_POL>(segs, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_param_resource<EXEC_POL>( segs, params, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel_param(SEGMENTS&& segs,
+                  PARAMS&& params,
+                  WORKING_RES work_res,
+                  Args&&... args)
+{
+  RAJA::kernel_param_resource<EXEC_POL>(segs, params, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args) {
-  RAJA::kernel_param<EXEC_POL>( segs, params, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel_param<EXEC_POL>(segs, params, args...);
 }
 
 //
@@ -49,97 +64,99 @@ typename std::enable_if< !USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs
 // Define list of nested loop types the ReduceMask test supports.
 //
 //
-using ReduceMaskSupportedLoopTypeList = camp::list<
-  DEVICE_DEPTH_2_REDUCESUM_WARPMASK,
-  DEVICE_DEPTH_2_REDUCESUM_WARPMASK_FORI
->;
+using ReduceMaskSupportedLoopTypeList =
+    camp::list<DEVICE_DEPTH_2_REDUCESUM_WARPMASK,
+               DEVICE_DEPTH_2_REDUCESUM_WARPMASK_FORI>;
 
 //
 //
 // Sum of array of elements with GPU-specific policies.
 //
 //
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
 void KernelWarpThreadTest(const DEVICE_DEPTH_2_REDUCESUM_WARPMASK&,
                           const RAJA::Index_type directlen,
                           const RAJA::Index_type looplen)
 {
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  allocateForallTestData<RAJA::Index_type>(directlen*looplen,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(directlen * looplen, erased_work_res,
+                                           &work_array, &check_array,
+                                           &test_array);
 
   RAJA::ReduceMax<REDUCE_POL, int> max_thread(0);
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> trip_count(0);
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> worksum(0);
 
   call_kernel<EXEC_POLICY, USE_RESOURCE>(
-                            RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, directlen), RAJA::TypedRangeSegment<RAJA::Index_type>(0, looplen)),
-                            work_res,
-                            [=] RAJA_DEVICE (RAJA::Index_type i, RAJA::Index_type RAJA_UNUSED_ARG(j)) {
-                              trip_count += 1;
-                              worksum += i; // i should only be 0..directlen-1
-                              max_thread.max(threadIdx.x);
-                            });
+      RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, directlen),
+                       RAJA::TypedRangeSegment<RAJA::Index_type>(0, looplen)),
+      work_res,
+      [=] RAJA_DEVICE(RAJA::Index_type i, RAJA::Index_type RAJA_UNUSED_ARG(j))
+      {
+        trip_count += 1;
+        worksum += i;  // i should only be 0..directlen-1
+        max_thread.max(threadIdx.x);
+      });
 
   ASSERT_EQ(max_thread.get(), 255);
-  ASSERT_EQ(trip_count.get(), looplen*directlen);
-  ASSERT_EQ(worksum.get(), looplen*directlen*(directlen-1)/2);
+  ASSERT_EQ(trip_count.get(), looplen * directlen);
+  ASSERT_EQ(worksum.get(), looplen * directlen * (directlen - 1) / 2);
 
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
 void KernelWarpThreadTest(const DEVICE_DEPTH_2_REDUCESUM_WARPMASK_FORI&,
                           const RAJA::Index_type directlen,
                           const RAJA::Index_type looplen)
 {
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  allocateForallTestData<RAJA::Index_type>(directlen*looplen,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(directlen * looplen, erased_work_res,
+                                           &work_array, &check_array,
+                                           &test_array);
 
   RAJA::ReduceMax<REDUCE_POL, int> max_thread(0);
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> trip_count(0);
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> worksum(0);
 
   call_kernel_param<EXEC_POLICY, USE_RESOURCE>(
-                            RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, directlen), RAJA::TypedRangeSegment<RAJA::Index_type>(0, looplen)),
-                            RAJA::make_tuple((RAJA::Index_type)0, (RAJA::Index_type)0),
-                            work_res,
-                            [=] RAJA_DEVICE (RAJA::Index_type RAJA_UNUSED_ARG(i), RAJA::Index_type RAJA_UNUSED_ARG(j), RAJA::Index_type RAJA_UNUSED_ARG(x), RAJA::Index_type y) {
-                              trip_count += 1;
-                              worksum += y; // y should only be 0..3
-                              max_thread.max(threadIdx.x);
-                            });
+      RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, directlen),
+                       RAJA::TypedRangeSegment<RAJA::Index_type>(0, looplen)),
+      RAJA::make_tuple((RAJA::Index_type)0, (RAJA::Index_type)0), work_res,
+      [=] RAJA_DEVICE(RAJA::Index_type RAJA_UNUSED_ARG(i),
+                      RAJA::Index_type RAJA_UNUSED_ARG(j),
+                      RAJA::Index_type RAJA_UNUSED_ARG(x), RAJA::Index_type y)
+      {
+        trip_count += 1;
+        worksum += y;  // y should only be 0..3
+        max_thread.max(threadIdx.x);
+      });
 
   ASSERT_EQ(max_thread.get(), 255);
-  ASSERT_EQ(trip_count.get(), looplen*directlen);
-  ASSERT_EQ(worksum.get(), looplen*directlen*(looplen-1)/2);
+  ASSERT_EQ(trip_count.get(), looplen * directlen);
+  ASSERT_EQ(worksum.get(), looplen * directlen * (looplen - 1) / 2);
 
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
 //
@@ -147,37 +164,43 @@ void KernelWarpThreadTest(const DEVICE_DEPTH_2_REDUCESUM_WARPMASK_FORI&,
 // Defining the Kernel Loop structure for ReduceMask Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
 struct WarpThreadExec;
 
 #if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP)
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct WarpThreadExec<DEVICE_DEPTH_2_REDUCESUM_WARPMASK, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct WarpThreadExec<DEVICE_DEPTH_2_REDUCESUM_WARPMASK,
+                      REDUCE_POL,
+                      POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          0,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::Lambda<0>>>>  // end DEVICE_KERNEL
+                         >;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct WarpThreadExec<DEVICE_DEPTH_2_REDUCESUM_WARPMASK_FORI, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::ForICount<0, RAJA::statement::Param<0>, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::ForICount<1, RAJA::statement::Param<1>, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct WarpThreadExec<DEVICE_DEPTH_2_REDUCESUM_WARPMASK_FORI,
+                      REDUCE_POL,
+                      POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<
+      RAJA::statement::DEVICE_KERNEL<RAJA::statement::ForICount<
+          0,
+          RAJA::statement::Param<0>,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::ForICount<
+              1,
+              RAJA::statement::Param<1>,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::Lambda<0>>>>  // end DEVICE_KERNEL
+      >;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP
diff --git a/test/functional/kernel/warp-thread/tests/warp-thread-ReduceWarp-impl.hpp b/test/functional/kernel/warp-thread/tests/warp-thread-ReduceWarp-impl.hpp
index 1771b99665..e69c46baa5 100644
--- a/test/functional/kernel/warp-thread/tests/warp-thread-ReduceWarp-impl.hpp
+++ b/test/functional/kernel/warp-thread/tests/warp-thread-ReduceWarp-impl.hpp
@@ -10,22 +10,31 @@
 
 #include <numeric>
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_param_resource<EXEC_POL>( segs, params, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel_param(SEGMENTS&& segs,
+                  PARAMS&& params,
+                  WORKING_RES work_res,
+                  Args&&... args)
+{
+  RAJA::kernel_param_resource<EXEC_POL>(segs, params, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args) {
-  RAJA::kernel_param<EXEC_POL>( segs, params, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel_param<EXEC_POL>(segs, params, args...);
 }
 
 //
@@ -33,67 +42,68 @@ typename std::enable_if< !USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs
 // Define list of nested loop types the ReduceWarp test supports.
 //
 //
-using ReduceWarpSupportedLoopTypeList = camp::list<
-  DEVICE_DEPTH_1_REDUCESUM_WARPREDUCE,
-  DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE,
-  DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE
->;
+using ReduceWarpSupportedLoopTypeList =
+    camp::list<DEVICE_DEPTH_1_REDUCESUM_WARPREDUCE,
+               DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE,
+               DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE>;
 
 //
 //
 // Sum of array of elements with GPU-specific policies.
 //
 //
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
 void KernelWarpThreadTest(const DEVICE_DEPTH_1_REDUCESUM_WARPREDUCE&,
                           const RAJA::Index_type len)
 {
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  allocateForallTestData<RAJA::Index_type>(len,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(len, erased_work_res, &work_array,
+                                           &check_array, &test_array);
 
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> worksum(0);
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> reduce_count(0);
 
   call_kernel_param<EXEC_POLICY, USE_RESOURCE>(
-                            RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, len)),
-                            RAJA::make_tuple((RAJA::Index_type)0),
-                            work_res,
+      RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, len)),
+      RAJA::make_tuple((RAJA::Index_type)0), work_res,
 
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type i, RAJA::Index_type &value) {
-                              value += i;
-                            },
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i, RAJA::Index_type & value)
+      { value += i; },
 
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type &value) {
-                              // This only gets executed on the "root" thread which received the reduced value.
-                              worksum += value;
-                              reduce_count += 1;
-                            });
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type & value)
+      {
+        // This only gets executed on the "root" thread which received the
+        // reduced value.
+        worksum += value;
+        reduce_count += 1;
+      });
 
-  ASSERT_EQ(worksum.get(), len*(len-1)/2);
+  ASSERT_EQ(worksum.get(), len * (len - 1) / 2);
   ASSERT_EQ(reduce_count.get(), 1);
 
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
-void KernelWarpThreadTest(const DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE&,
-                          const RAJA::Index_type len) // len needs to be divisible by 10 and 16
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
+void KernelWarpThreadTest(
+    const DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE&,
+    const RAJA::Index_type len)  // len needs to be divisible by 10 and 16
 {
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
@@ -102,88 +112,85 @@ void KernelWarpThreadTest(const DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE&,
   RAJA::Index_type innerlen = 10;
   RAJA::Index_type outerlen = len / innerlen;
 
-  allocateForallTestData<RAJA::Index_type>(len,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(len, erased_work_res, &work_array,
+                                           &check_array, &test_array);
 
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> worksum(0);
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> reduce_count(0);
 
   call_kernel_param<EXEC_POLICY, USE_RESOURCE>(
-                            RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, outerlen),
-                                             RAJA::TypedRangeSegment<RAJA::Index_type>(0, innerlen)),
-                            RAJA::make_tuple((RAJA::Index_type)0),
-                            work_res,
-
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type i, RAJA::Index_type j, RAJA::Index_type &value) {
-                              value += i + j * outerlen;
-                            },
-
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type &value) {
-                              // This only gets executed on the "root" thread which received the reduced value.
-                              worksum += value;
-                              reduce_count += 1;
-                            });
-
-  ASSERT_EQ(worksum.get(), outerlen*innerlen*(outerlen*innerlen-1)/2);
+      RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, outerlen),
+                       RAJA::TypedRangeSegment<RAJA::Index_type>(0, innerlen)),
+      RAJA::make_tuple((RAJA::Index_type)0), work_res,
+
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i, RAJA::Index_type j,
+                           RAJA::Index_type & value)
+      { value += i + j * outerlen; },
+
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type & value)
+      {
+        // This only gets executed on the "root" thread which received the
+        // reduced value.
+        worksum += value;
+        reduce_count += 1;
+      });
+
+  ASSERT_EQ(worksum.get(), outerlen * innerlen * (outerlen * innerlen - 1) / 2);
   ASSERT_EQ(reduce_count.get(), innerlen);
 
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
-void KernelWarpThreadTest(const DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE&,
-                          const RAJA::Index_type len) // len needs to be divisible by 10 and 16
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
+void KernelWarpThreadTest(
+    const DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE&,
+    const RAJA::Index_type len)  // len needs to be divisible by 10 and 16
 {
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  RAJA::Index_type innerlen = 10;
+  RAJA::Index_type innerlen  = 10;
   RAJA::Index_type middlelen = 16;
-  RAJA::Index_type outerlen = len / (innerlen*middlelen);
+  RAJA::Index_type outerlen  = len / (innerlen * middlelen);
 
-  allocateForallTestData<RAJA::Index_type>(len,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(len, erased_work_res, &work_array,
+                                           &check_array, &test_array);
 
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> worksum(0);
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> reduce_count(0);
 
   call_kernel_param<EXEC_POLICY, USE_RESOURCE>(
-                            RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, outerlen),
-                                             RAJA::TypedRangeSegment<RAJA::Index_type>(0, middlelen),
-                                             RAJA::TypedRangeSegment<RAJA::Index_type>(0, innerlen)),
-                            RAJA::make_tuple((RAJA::Index_type)0),
-                            work_res,
-
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type i, RAJA::Index_type j, RAJA::Index_type k, RAJA::Index_type &value) {
-                              value += i + j * outerlen + k * outerlen * middlelen;
-                            },
-
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type &value) {
-                              // This only gets executed on the "root" thread which received the reduced value.
-                              worksum += value;
-                              reduce_count += 1;
-                            });
-
-  ASSERT_EQ(worksum.get(), outerlen*middlelen*innerlen*(outerlen*middlelen*innerlen-1)/2);
-  ASSERT_EQ(reduce_count.get(), middlelen*innerlen);
-
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+      RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, outerlen),
+                       RAJA::TypedRangeSegment<RAJA::Index_type>(0, middlelen),
+                       RAJA::TypedRangeSegment<RAJA::Index_type>(0, innerlen)),
+      RAJA::make_tuple((RAJA::Index_type)0), work_res,
+
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i, RAJA::Index_type j,
+                           RAJA::Index_type k, RAJA::Index_type & value)
+      { value += i + j * outerlen + k * outerlen * middlelen; },
+
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type & value)
+      {
+        // This only gets executed on the "root" thread which received the
+        // reduced value.
+        worksum += value;
+        reduce_count += 1;
+      });
+
+  ASSERT_EQ(worksum.get(), outerlen * middlelen * innerlen *
+                               (outerlen * middlelen * innerlen - 1) / 2);
+  ASSERT_EQ(reduce_count.get(), middlelen * innerlen);
+
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
 //
@@ -191,57 +198,75 @@ void KernelWarpThreadTest(const DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE&,
 // Defining the Kernel Loop structure for ReduceWarp Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
 struct WarpThreadExec;
 
 #if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP)
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct WarpThreadExec<DEVICE_DEPTH_1_REDUCESUM_WARPREDUCE, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type, RAJA::statement::Lambda<0>>,
-        RAJA::statement::Reduce<typename camp::at<POLICY_DATA, camp::num<1>>::type, RAJA::operators::plus, RAJA::statement::Param<0>,
-          RAJA::statement::Lambda<1, RAJA::Params<0>>
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct WarpThreadExec<DEVICE_DEPTH_1_REDUCESUM_WARPREDUCE,
+                      REDUCE_POL,
+                      POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<
+      RAJA::statement::For<0,
+                           typename camp::at<POLICY_DATA, camp::num<0>>::type,
+                           RAJA::statement::Lambda<0>>,
+      RAJA::statement::Reduce<
+          typename camp::at<POLICY_DATA, camp::num<1>>::type,
+          RAJA::operators::plus,
+          RAJA::statement::Param<0>,
+          RAJA::statement::Lambda<1, RAJA::Params<0>>>>  // end DEVICE_KERNEL
+                                  >;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct WarpThreadExec<DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<1>>::type, RAJA::statement::Lambda<0>
-          >
-        >,
-        RAJA::statement::Reduce<typename camp::at<POLICY_DATA, camp::num<2>>::type, RAJA::operators::plus, RAJA::statement::Param<0>,
-          RAJA::statement::Lambda<1, RAJA::Params<0>>
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct WarpThreadExec<DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE,
+                      REDUCE_POL,
+                      POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<
+      RAJA::statement::For<
+          1,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              0,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::Lambda<0>>>,
+      RAJA::statement::Reduce<
+          typename camp::at<POLICY_DATA, camp::num<2>>::type,
+          RAJA::operators::plus,
+          RAJA::statement::Param<0>,
+          RAJA::statement::Lambda<1, RAJA::Params<0>>>>  // end DEVICE_KERNEL
+                                  >;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct WarpThreadExec<DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<2, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::For<1, typename camp::at<POLICY_DATA, camp::num<1>>::type,
-            RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<2>>::type, RAJA::statement::Lambda<0>
-            > // end For 0
-          >,  // end For 1
-          typename camp::at<POLICY_DATA, camp::num<3>>::type, // warp synchronize
-          RAJA::statement::Reduce<typename camp::at<POLICY_DATA, camp::num<4>>::type, RAJA::operators::plus, RAJA::statement::Param<0>,
-            RAJA::statement::Lambda<1, RAJA::Params<0>>
-          >
-        > // end For 2
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct WarpThreadExec<DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE,
+                      REDUCE_POL,
+                      POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::For<
+          2,
+          typename camp::at<POLICY_DATA, camp::num<0>>::type,
+          RAJA::statement::For<
+              1,
+              typename camp::at<POLICY_DATA, camp::num<1>>::type,
+              RAJA::statement::For<
+                  0,
+                  typename camp::at<POLICY_DATA, camp::num<2>>::type,
+                  RAJA::statement::Lambda<0>>                  // end For 0
+              >,                                               // end For 1
+          typename camp::at<POLICY_DATA, camp::num<3>>::type,  // warp
+                                                               // synchronize
+          RAJA::statement::Reduce<
+              typename camp::at<POLICY_DATA, camp::num<4>>::type,
+              RAJA::operators::plus,
+              RAJA::statement::Param<0>,
+              RAJA::statement::Lambda<1, RAJA::Params<0>>>>  // end For 2
+                                                        >  // end DEVICE_KERNEL
+                         >;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP
diff --git a/test/functional/kernel/warp-thread/tests/warp-thread-WarpLoop-impl.hpp b/test/functional/kernel/warp-thread/tests/warp-thread-WarpLoop-impl.hpp
index ba4f445c88..d0a8e51af3 100644
--- a/test/functional/kernel/warp-thread/tests/warp-thread-WarpLoop-impl.hpp
+++ b/test/functional/kernel/warp-thread/tests/warp-thread-WarpLoop-impl.hpp
@@ -10,38 +10,53 @@
 
 #include <numeric>
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_resource<EXEC_POL>( segs, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES work_res, Args&&... args)
+{
+  RAJA::kernel_resource<EXEC_POL>(segs, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args) {
-  RAJA::kernel<EXEC_POL>( segs, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel(SEGMENTS&& segs, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel<EXEC_POL>(segs, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES work_res, Args&&... args) {
-  RAJA::kernel_param_resource<EXEC_POL>( segs, params, work_res, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<USE_RESOURCE>::type
+call_kernel_param(SEGMENTS&& segs,
+                  PARAMS&& params,
+                  WORKING_RES work_res,
+                  Args&&... args)
+{
+  RAJA::kernel_param_resource<EXEC_POL>(segs, params, work_res, args...);
 }
 
-template<typename EXEC_POL, bool USE_RESOURCE,
-         typename SEGMENTS,
-         typename PARAMS,
-         typename WORKING_RES,
-         typename... Args>
-typename std::enable_if< !USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args) {
-  RAJA::kernel_param<EXEC_POL>( segs, params, args...);
+template <typename EXEC_POL,
+          bool USE_RESOURCE,
+          typename SEGMENTS,
+          typename PARAMS,
+          typename WORKING_RES,
+          typename... Args>
+typename std::enable_if<!USE_RESOURCE>::type
+call_kernel_param(SEGMENTS&& segs, PARAMS&& params, WORKING_RES, Args&&... args)
+{
+  RAJA::kernel_param<EXEC_POL>(segs, params, args...);
 }
 
 //
@@ -49,93 +64,96 @@ typename std::enable_if< !USE_RESOURCE >::type call_kernel_param(SEGMENTS&& segs
 // Define list of nested loop types the WarpLoop test supports.
 //
 //
-using WarpLoopSupportedLoopTypeList = camp::list<
-  DEVICE_DEPTH_1_REDUCESUM_WARP,
-  DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE,
-  DEVICE_DEPTH_2_REDUCESUM_WARP
->;
+using WarpLoopSupportedLoopTypeList =
+    camp::list<DEVICE_DEPTH_1_REDUCESUM_WARP,
+               DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE,
+               DEVICE_DEPTH_2_REDUCESUM_WARP>;
 
 //
 //
 // Sum of array of elements with GPU-specific policies.
 //
 //
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
 void KernelWarpThreadTest(const DEVICE_DEPTH_1_REDUCESUM_WARP&,
                           const RAJA::Index_type len)
 {
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  allocateForallTestData<RAJA::Index_type>(len,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(len, erased_work_res, &work_array,
+                                           &check_array, &test_array);
 
   RAJA::TypedRangeSegment<RAJA::Index_type> rangelen(0, len);
 
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> worksum(0);
 
-  call_kernel<EXEC_POLICY, USE_RESOURCE>(RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, len)), work_res,
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
-                              worksum += i;
-                            });
+  call_kernel<EXEC_POLICY, USE_RESOURCE>(
+      RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, len)),
+      work_res, [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { worksum += i; });
 
-  ASSERT_EQ(worksum.get(), len*(len-1)/2);
+  ASSERT_EQ(worksum.get(), len * (len - 1) / 2);
 
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE>
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE>
 void KernelWarpThreadTest(const DEVICE_DEPTH_2_REDUCESUM_WARP&,
                           const RAJA::Index_type numtiles)
 {
-  WORKING_RES work_res{WORKING_RES::get_default()};
-  camp::resources::Resource erased_work_res{work_res};
+  WORKING_RES work_res {WORKING_RES::get_default()};
+  camp::resources::Resource erased_work_res {work_res};
 
   RAJA::Index_type flatSize = 32 * numtiles;
   RAJA::Index_type* work_array;
   RAJA::Index_type* check_array;
   RAJA::Index_type* test_array;
 
-  allocateForallTestData<RAJA::Index_type>(flatSize,
-                                     erased_work_res,
-                                     &work_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<RAJA::Index_type>(
+      flatSize, erased_work_res, &work_array, &check_array, &test_array);
 
   RAJA::TypedRangeSegment<RAJA::Index_type> rangelen(0, flatSize);
 
   RAJA::ReduceSum<REDUCE_POL, RAJA::Index_type> worksum(0);
 
   call_kernel_param<EXEC_POLICY, USE_RESOURCE>(
-                            RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, flatSize)),
-                            RAJA::make_tuple((RAJA::Index_type)0),
-                            work_res,
-                            [=] RAJA_HOST_DEVICE (RAJA::Index_type RAJA_UNUSED_ARG(i), RAJA::Index_type j) {
-                              worksum += j; // j should only be 0..31
-                            });
-
-  ASSERT_EQ(worksum.get(), numtiles*32*(32-1)/2);
-
-  deallocateForallTestData<RAJA::Index_type>(erased_work_res,
-                                       work_array,
-                                       check_array,
-                                       test_array);
+      RAJA::make_tuple(RAJA::TypedRangeSegment<RAJA::Index_type>(0, flatSize)),
+      RAJA::make_tuple((RAJA::Index_type)0), work_res,
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type RAJA_UNUSED_ARG(i),
+                           RAJA::Index_type j)
+      {
+        worksum += j;  // j should only be 0..31
+      });
+
+  ASSERT_EQ(worksum.get(), numtiles * 32 * (32 - 1) / 2);
+
+  deallocateForallTestData<RAJA::Index_type>(erased_work_res, work_array,
+                                             check_array, test_array);
 }
 
-// More specific execution policies that use the above DEVICE_DEPTH_1_REDUCESUM_WARP test.
-template <typename WORKING_RES, typename EXEC_POLICY, typename REDUCE_POL, bool USE_RESOURCE, typename... Args>
-void KernelWarpThreadTest(const DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE&, Args... args){
-  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(DEVICE_DEPTH_1_REDUCESUM_WARP(), args...);
+// More specific execution policies that use the above
+// DEVICE_DEPTH_1_REDUCESUM_WARP test.
+template <typename WORKING_RES,
+          typename EXEC_POLICY,
+          typename REDUCE_POL,
+          bool USE_RESOURCE,
+          typename... Args>
+void KernelWarpThreadTest(const DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE&,
+                          Args... args)
+{
+  KernelWarpThreadTest<WORKING_RES, EXEC_POLICY, REDUCE_POL, USE_RESOURCE>(
+      DEVICE_DEPTH_1_REDUCESUM_WARP(), args...);
 }
 
 //
@@ -143,49 +161,52 @@ void KernelWarpThreadTest(const DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE&, Args.
 // Defining the Kernel Loop structure for WarpLoop Nested Loop Tests.
 //
 //
-template<typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
+template <typename POLICY_TYPE, typename REDUCE_POL, typename POLICY_DATA>
 struct WarpThreadExec;
 
 #if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP)
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct WarpThreadExec<DEVICE_DEPTH_1_REDUCESUM_WARP, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-          RAJA::statement::Lambda<0>
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct WarpThreadExec<DEVICE_DEPTH_1_REDUCESUM_WARP, REDUCE_POL, POLICY_DATA>
+{
+  using type = RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<
+      RAJA::statement::For<0,
+                           typename camp::at<POLICY_DATA, camp::num<0>>::type,
+                           RAJA::statement::Lambda<0>>>  // end DEVICE_KERNEL
+                                  >;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct WarpThreadExec<DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::seq_exec,
-          RAJA::statement::For<0, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct WarpThreadExec<DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE,
+                      REDUCE_POL,
+                      POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::Tile<
+          0,
+          RAJA::tile_fixed<32>,
+          RAJA::seq_exec,
+          RAJA::statement::For<
+              0,
+              typename camp::at<POLICY_DATA, camp::num<0>>::type,
+              RAJA::statement::Lambda<0>>>>  // end DEVICE_KERNEL
+                         >;
 };
 
-template<typename REDUCE_POL, typename POLICY_DATA>
-struct WarpThreadExec<DEVICE_DEPTH_2_REDUCESUM_WARP, REDUCE_POL, POLICY_DATA> {
-  using type = 
-    RAJA::KernelPolicy<
-      RAJA::statement::DEVICE_KERNEL<
-        RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::seq_exec,
-          RAJA::statement::ForICount<0, RAJA::statement::Param<0>, typename camp::at<POLICY_DATA, camp::num<0>>::type,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      > // end DEVICE_KERNEL
-    >;
+template <typename REDUCE_POL, typename POLICY_DATA>
+struct WarpThreadExec<DEVICE_DEPTH_2_REDUCESUM_WARP, REDUCE_POL, POLICY_DATA>
+{
+  using type =
+      RAJA::KernelPolicy<RAJA::statement::DEVICE_KERNEL<RAJA::statement::Tile<
+          0,
+          RAJA::tile_fixed<32>,
+          RAJA::seq_exec,
+          RAJA::statement::ForICount<
+              0,
+              RAJA::statement::Param<0>,
+              typename camp::at<POLICY_DATA, camp::num<0>>::type,
+              RAJA::statement::Lambda<0>>>>  // end DEVICE_KERNEL
+                         >;
 };
 
 #endif  // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP
diff --git a/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp b/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp
index 04bc3bcc5e..c20b66a95d 100644
--- a/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp
+++ b/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp
@@ -21,11 +21,11 @@
 // Defining the Launch Loop structure for MultiReduce Nested Loop Tests.
 //
 //
-template <typename EXEC_POL_DATA, typename IDX_TYPE,
+template <typename EXEC_POL_DATA,
+          typename IDX_TYPE,
           typename SEGMENTS_TYPE,
           typename Lambda>
-void Launch(const SEGMENTS_TYPE& segments,
-                  Lambda&& lambda)
+void Launch(const SEGMENTS_TYPE& segments, Lambda&& lambda)
 {
   using RAJA::get;
 
@@ -55,41 +55,69 @@ void Launch(const SEGMENTS_TYPE& segments,
   IDX_TYPE blocks_j = RAJA_DIVIDE_CEILING_INT(distance_sj, threads_j);
   IDX_TYPE blocks_k = RAJA_DIVIDE_CEILING_INT(distance_sk, threads_k);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks_i, blocks_j, blocks_k),
-                        RAJA::Threads(threads_i, threads_j,threads_k)),
-      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-    RAJA::loop<TEAM_Z_POLICY>(ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, blocks_k), [&](IDX_TYPE bk) {
-      RAJA::loop<TEAM_Y_POLICY>(ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, blocks_j), [&](IDX_TYPE bj) {
-        RAJA::loop<TEAM_X_POLICY>(ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, blocks_i), [&](IDX_TYPE bi) {
-
-          RAJA::loop<THREAD_Z_POLICY>(ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, threads_k), [&](IDX_TYPE tk) {
-            RAJA::loop<THREAD_Y_POLICY>(ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, threads_j), [&](IDX_TYPE tj) {
-              RAJA::loop<THREAD_X_POLICY>(ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, threads_i), [&](IDX_TYPE ti) {
-
-                IDX_TYPE i = ti + threads_i * bi;
-                IDX_TYPE j = tj + threads_j * bj;
-                IDX_TYPE k = tk + threads_k * bk;
-
-                if (i < distance_si && j < distance_sj && k < distance_sk) {
-                  lambda(begin_sk[k], begin_sj[j], begin_si[i]);
-                }
-              });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks_i, blocks_j, blocks_k),
+                         RAJA::Threads(threads_i, threads_j, threads_k)),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<TEAM_Z_POLICY>(
+            ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, blocks_k),
+            [&](IDX_TYPE bk)
+            {
+              RAJA::loop<TEAM_Y_POLICY>(
+                  ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, blocks_j),
+                  [&](IDX_TYPE bj)
+                  {
+                    RAJA::loop<TEAM_X_POLICY>(
+                        ctx, RAJA::TypedRangeSegment<IDX_TYPE>(0, blocks_i),
+                        [&](IDX_TYPE bi)
+                        {
+                          RAJA::loop<THREAD_Z_POLICY>(
+                              ctx,
+                              RAJA::TypedRangeSegment<IDX_TYPE>(0, threads_k),
+                              [&](IDX_TYPE tk)
+                              {
+                                RAJA::loop<THREAD_Y_POLICY>(
+                                    ctx,
+                                    RAJA::TypedRangeSegment<IDX_TYPE>(
+                                        0, threads_j),
+                                    [&](IDX_TYPE tj)
+                                    {
+                                      RAJA::loop<THREAD_X_POLICY>(
+                                          ctx,
+                                          RAJA::TypedRangeSegment<IDX_TYPE>(
+                                              0, threads_i),
+                                          [&](IDX_TYPE ti)
+                                          {
+                                            IDX_TYPE i = ti + threads_i * bi;
+                                            IDX_TYPE j = tj + threads_j * bj;
+                                            IDX_TYPE k = tk + threads_k * bk;
+
+                                            if (i < distance_si &&
+                                                j < distance_sj &&
+                                                k < distance_sk)
+                                            {
+                                              lambda(begin_sk[k], begin_sj[j],
+                                                     begin_si[i]);
+                                            }
+                                          });
+                                    });
+                              });
+                        });
+                  });
             });
-          });
-
-        });
       });
-    });
-
-  });
 }
 
-template <typename EXEC_POL_DATA, typename REDUCE_POLICY, typename ABSTRACTION,
-          typename DATA_TYPE, typename IDX_TYPE,
-          typename SEGMENTS_TYPE, typename Container,
-          typename WORKING_RES, typename RandomGenerator>
+template <typename EXEC_POL_DATA,
+          typename REDUCE_POLICY,
+          typename ABSTRACTION,
+          typename DATA_TYPE,
+          typename IDX_TYPE,
+          typename SEGMENTS_TYPE,
+          typename Container,
+          typename WORKING_RES,
+          typename RandomGenerator>
 // use enable_if in return type to appease nvcc 11.2
 // add bool return type to disambiguate signatures of these functions for MSVC
 std::enable_if_t<!ABSTRACTION::template supports<DATA_TYPE>(), bool>
@@ -97,12 +125,19 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE&,
                                 const Container&,
                                 WORKING_RES,
                                 RandomGenerator&)
-{ return false; }
+{
+  return false;
+}
 ///
-template <typename EXEC_POL_DATA, typename REDUCE_POLICY, typename ABSTRACTION,
-          typename DATA_TYPE, typename IDX_TYPE,
-          typename SEGMENTS_TYPE, typename Container,
-          typename WORKING_RES, typename RandomGenerator>
+template <typename EXEC_POL_DATA,
+          typename REDUCE_POLICY,
+          typename ABSTRACTION,
+          typename DATA_TYPE,
+          typename IDX_TYPE,
+          typename SEGMENTS_TYPE,
+          typename Container,
+          typename WORKING_RES,
+          typename RandomGenerator>
 // use enable_if in return type to appease nvcc 11.2
 std::enable_if_t<ABSTRACTION::template supports<DATA_TYPE>()>
 LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
@@ -111,7 +146,8 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
                                 RandomGenerator& rngen)
 {
   using RAJA::get;
-  using MULTIREDUCER = typename ABSTRACTION::template multi_reducer<REDUCE_POLICY, DATA_TYPE>;
+  using MULTIREDUCER =
+      typename ABSTRACTION::template multi_reducer<REDUCE_POLICY, DATA_TYPE>;
 
   auto si = get<2>(segments);
   auto sj = get<1>(segments);
@@ -121,13 +157,13 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
   RAJA_EXTRACT_BED_SUFFIXED(sj, _sj);
   RAJA_EXTRACT_BED_SUFFIXED(sk, _sk);
 
-  IDX_TYPE dimi = begin_si[distance_si-1] + 1;
-  IDX_TYPE dimj = begin_sj[distance_sj-1] + 1;
-  IDX_TYPE dimk = begin_sk[distance_sk-1] + 1;
+  IDX_TYPE dimi = begin_si[distance_si - 1] + 1;
+  IDX_TYPE dimj = begin_sj[distance_sj - 1] + 1;
+  IDX_TYPE dimk = begin_sk[distance_sk - 1] + 1;
 
   const IDX_TYPE idx_range = dimi * dimj * dimk;
 
-  const int modval = 100;
+  const int modval      = 100;
   const size_t num_bins = multi_init.size();
 
   IDX_TYPE* working_range;
@@ -144,51 +180,50 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
 
   IDX_TYPE data_len = 0;
 
-  allocateForallTestData(idx_range+1,
-                         working_res,
-                         &working_range,
-                         &check_range,
-                         &test_range);
+  allocateForallTestData(idx_range + 1, working_res, &working_range,
+                         &check_range, &test_range);
 
-  for (IDX_TYPE i = 0; i < idx_range+1; ++i) {
+  for (IDX_TYPE i = 0; i < idx_range + 1; ++i)
+  {
     test_range[i] = ~IDX_TYPE(0);
   }
 
   {
-    std::uniform_int_distribution<IDX_TYPE> work_per_iterate_distribution(0, num_bins);
-
-    for (IDX_TYPE k : sk) {
-      for (IDX_TYPE j : sj) {
-        for (IDX_TYPE i : si) {
-          IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
+    std::uniform_int_distribution<IDX_TYPE> work_per_iterate_distribution(
+        0, num_bins);
+
+    for (IDX_TYPE k : sk)
+    {
+      for (IDX_TYPE j : sj)
+      {
+        for (IDX_TYPE i : si)
+        {
+          IDX_TYPE ii    = (dimi * dimj * k) + (dimi * j) + i;
           test_range[ii] = data_len;
           data_len += work_per_iterate_distribution(rngen);
-          test_range[ii+1] = data_len;
+          test_range[ii + 1] = data_len;
         }
       }
     }
   }
 
-  allocateForallTestData(data_len,
-                         working_res,
-                         &working_array,
-                         &check_array,
+  allocateForallTestData(data_len, working_res, &working_array, &check_array,
                          &test_array);
 
-  allocateForallTestData(data_len,
-                         working_res,
-                         &working_bins,
-                         &check_bins,
+  allocateForallTestData(data_len, working_res, &working_bins, &check_bins,
                          &test_bins);
 
-  if (data_len > IDX_TYPE(0)) {
+  if (data_len > IDX_TYPE(0))
+  {
 
-    // use ints to initialize array here to avoid floating point precision issues
-    std::uniform_int_distribution<int> array_int_distribution(0, modval-1);
-    std::uniform_int_distribution<IDX_TYPE> bin_distribution(0, num_bins-1);
+    // use ints to initialize array here to avoid floating point precision
+    // issues
+    std::uniform_int_distribution<int> array_int_distribution(0, modval - 1);
+    std::uniform_int_distribution<IDX_TYPE> bin_distribution(0, num_bins - 1);
 
 
-    for (IDX_TYPE i = 0; i < data_len; ++i) {
+    for (IDX_TYPE i = 0; i < data_len; ++i)
+    {
       test_array[i] = DATA_TYPE(array_int_distribution(rngen));
 
       // this may use the same bin multiple times per iterate
@@ -196,7 +231,8 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
     }
   }
 
-  working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1));
+  working_res.memcpy(working_range, test_range,
+                     sizeof(IDX_TYPE) * (idx_range + 1));
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
   working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len);
 
@@ -208,21 +244,28 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
   {
     std::vector<DATA_TYPE> ref_vals(num_bins, ABSTRACTION::identity(red));
 
-    for (IDX_TYPE i = 0; i < data_len; ++i) {
-      ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
+    for (IDX_TYPE i = 0; i < data_len; ++i)
+    {
+      ref_vals[test_bins[i]] =
+          ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
     }
 
-    Launch<EXEC_POL_DATA, IDX_TYPE>(segments,
-        [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) {
-      IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
-      for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-        ABSTRACTION::reduce(red[working_bins[idx]],  working_array[idx]);
-        ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]);
-      }
-    });
+    Launch<EXEC_POL_DATA, IDX_TYPE>(
+        segments,
+        [=] RAJA_HOST_DEVICE(IDX_TYPE k, IDX_TYPE j, IDX_TYPE i)
+        {
+          IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
+          for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii + 1];
+               ++idx)
+          {
+            ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
+            ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]);
+          }
+        });
 
     size_t bin = 0;
-    for (auto init_val : multi_init) {
+    for (auto init_val : multi_init)
+    {
       ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]);
       ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val));
       ++bin;
@@ -232,46 +275,60 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
 
   red.reset();
 
-  // basic multiple use test, ensure same reducer can combine values from multiple loops
+  // basic multiple use test, ensure same reducer can combine values from
+  // multiple loops
   {
     std::vector<DATA_TYPE> ref_vals(num_bins, ABSTRACTION::identity(red));
 
     const int nloops = 2;
-    for (int j = 0; j < nloops; ++j) {
+    for (int j = 0; j < nloops; ++j)
+    {
 
-      for (IDX_TYPE i = 0; i < data_len; ++i) {
-        ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
+      for (IDX_TYPE i = 0; i < data_len; ++i)
+      {
+        ref_vals[test_bins[i]] =
+            ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]);
       }
 
-      Launch<EXEC_POL_DATA, IDX_TYPE>(segments,
-          [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) {
-        IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
-        for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-          ABSTRACTION::reduce(red[working_bins[idx]],  working_array[idx]);
-        }
-      });
+      Launch<EXEC_POL_DATA, IDX_TYPE>(
+          segments,
+          [=] RAJA_HOST_DEVICE(IDX_TYPE k, IDX_TYPE j, IDX_TYPE i)
+          {
+            IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
+            for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii + 1];
+                 ++idx)
+            {
+              ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
+            }
+          });
     }
 
-    for (size_t bin = 0; bin < num_bins; ++bin) {
+    for (size_t bin = 0; bin < num_bins; ++bin)
+    {
       ASSERT_EQ(static_cast<DATA_TYPE>(red[bin].get()), ref_vals[bin]);
     }
   }
 
 
   // test the consistency of answers, if we expect them to be consistent
-  if (ABSTRACTION::consistent(red)) {
+  if (ABSTRACTION::consistent(red))
+  {
 
-    if /* constexpr */ (std::is_floating_point<DATA_TYPE>::value) {
+    if /* constexpr */ (std::is_floating_point<DATA_TYPE>::value)
+    {
 
       // use floating point values to accentuate floating point precision issues
       std::conditional_t<!std::is_floating_point<DATA_TYPE>::value,
-          std::uniform_int_distribution<DATA_TYPE>,
-          std::uniform_real_distribution<DATA_TYPE>> array_flt_distribution(0, modval-1);
+                         std::uniform_int_distribution<DATA_TYPE>,
+                         std::uniform_real_distribution<DATA_TYPE>>
+          array_flt_distribution(0, modval - 1);
 
-      for (IDX_TYPE i = 0; i < data_len; ++i) {
+      for (IDX_TYPE i = 0; i < data_len; ++i)
+      {
         test_array[i] = DATA_TYPE(array_flt_distribution(rngen));
       }
-      working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
+      working_res.memcpy(working_array, test_array,
+                         sizeof(DATA_TYPE) * data_len);
     }
 
 
@@ -279,23 +336,32 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
     bool got_ref_vals = false;
 
     const int nloops = 2;
-    for (int j = 0; j < nloops; ++j) {
+    for (int j = 0; j < nloops; ++j)
+    {
       red.reset();
 
-      Launch<EXEC_POL_DATA, IDX_TYPE>(segments,
-          [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) {
-        IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
-        for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) {
-          ABSTRACTION::reduce(red[working_bins[idx]],  working_array[idx]);
-        }
-      });
+      Launch<EXEC_POL_DATA, IDX_TYPE>(
+          segments,
+          [=] RAJA_HOST_DEVICE(IDX_TYPE k, IDX_TYPE j, IDX_TYPE i)
+          {
+            IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i;
+            for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii + 1];
+                 ++idx)
+            {
+              ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]);
+            }
+          });
 
-      if (!got_ref_vals) {
+      if (!got_ref_vals)
+      {
         ref_vals.resize(num_bins);
         red.get_all(ref_vals);
         got_ref_vals = true;
-      } else {
-        for (size_t bin = 0; bin < num_bins; ++bin) {
+      }
+      else
+      {
+        for (size_t bin = 0; bin < num_bins; ++bin)
+        {
           ASSERT_EQ(red.get(bin), ref_vals[bin]);
         }
       }
@@ -303,26 +369,16 @@ LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments,
   }
 
 
-  deallocateForallTestData(working_res,
-                           working_bins,
-                           check_bins,
-                           test_bins);
-  deallocateForallTestData(working_res,
-                           working_array,
-                           check_array,
-                           test_array);
-  deallocateForallTestData(working_res,
-                           working_range,
-                           check_range,
-                           test_range);
+  deallocateForallTestData(working_res, working_bins, check_bins, test_bins);
+  deallocateForallTestData(working_res, working_array, check_array, test_array);
+  deallocateForallTestData(working_res, working_range, check_range, test_range);
 }
 
 
 TYPED_TEST_SUITE_P(LaunchMultiReduceNestedTest);
 template <typename T>
 class LaunchMultiReduceNestedTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchMultiReduceNestedTest, MultiReduceNestedLaunch)
 {
@@ -334,43 +390,48 @@ TYPED_TEST_P(LaunchMultiReduceNestedTest, MultiReduceNestedLaunch)
   using ABSTRACTION   = typename camp::at<TypeParam, camp::num<5>>::type;
 
   // for setting random values in arrays
-  auto random_seed = std::random_device{}();
+  auto random_seed = std::random_device {}();
   std::mt19937 rngen(random_seed);
 
-  WORKING_RES working_res{WORKING_RES::get_default()};
+  WORKING_RES working_res {WORKING_RES::get_default()};
 
   std::vector<DATA_TYPE> container;
 
   std::vector<size_t> num_bins_max_container({0, 1, 100});
   size_t num_bins_min = 0;
-  for (size_t num_bins_max : num_bins_max_container) {
+  for (size_t num_bins_max : num_bins_max_container)
+  {
 
-    std::uniform_int_distribution<size_t> num_bins_dist(num_bins_min, num_bins_max);
-    num_bins_min = num_bins_max+1;
+    std::uniform_int_distribution<size_t> num_bins_dist(num_bins_min,
+                                                        num_bins_max);
+    num_bins_min    = num_bins_max + 1;
     size_t num_bins = num_bins_dist(rngen);
 
     container.resize(num_bins, DATA_TYPE(2));
 
     // Range segment tests
-    auto s1 = RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>( 0, 2 ),
-                               RAJA::TypedRangeSegment<IDX_TYPE>( 0, 7 ),
-                               RAJA::TypedRangeSegment<IDX_TYPE>( 0, 3 ));
-    LaunchMultiReduceNestedTestImpl<EXEC_POL_DATA, REDUCE_POLICY, ABSTRACTION, DATA_TYPE, IDX_TYPE>(
-                                   s1, container, working_res, rngen);
-
-    auto s2 = RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>( 2, 35 ),
-                               RAJA::TypedRangeSegment<IDX_TYPE>( 0, 19 ),
-                               RAJA::TypedRangeSegment<IDX_TYPE>( 3, 13 ));
-    LaunchMultiReduceNestedTestImpl<EXEC_POL_DATA, REDUCE_POLICY, ABSTRACTION, DATA_TYPE, IDX_TYPE>(
-                                   s2, container, working_res, rngen);
+    auto s1 = RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>(0, 2),
+                               RAJA::TypedRangeSegment<IDX_TYPE>(0, 7),
+                               RAJA::TypedRangeSegment<IDX_TYPE>(0, 3));
+    LaunchMultiReduceNestedTestImpl<EXEC_POL_DATA, REDUCE_POLICY, ABSTRACTION,
+                                    DATA_TYPE, IDX_TYPE>(s1, container,
+                                                         working_res, rngen);
+
+    auto s2 = RAJA::make_tuple(RAJA::TypedRangeSegment<IDX_TYPE>(2, 35),
+                               RAJA::TypedRangeSegment<IDX_TYPE>(0, 19),
+                               RAJA::TypedRangeSegment<IDX_TYPE>(3, 13));
+    LaunchMultiReduceNestedTestImpl<EXEC_POL_DATA, REDUCE_POLICY, ABSTRACTION,
+                                    DATA_TYPE, IDX_TYPE>(s2, container,
+                                                         working_res, rngen);
 
     // Range-stride segment tests
-    auto s3 = RAJA::make_tuple(RAJA::TypedRangeStrideSegment<IDX_TYPE>( 0, 6, 2 ),
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>( 1, 38, 3 ),
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>( 5, 17, 1 ));
-    LaunchMultiReduceNestedTestImpl<EXEC_POL_DATA, REDUCE_POLICY, ABSTRACTION, DATA_TYPE, IDX_TYPE>(
-                                   s3, container, working_res, rngen);
-
+    auto s3 =
+        RAJA::make_tuple(RAJA::TypedRangeStrideSegment<IDX_TYPE>(0, 6, 2),
+                         RAJA::TypedRangeStrideSegment<IDX_TYPE>(1, 38, 3),
+                         RAJA::TypedRangeStrideSegment<IDX_TYPE>(5, 17, 1));
+    LaunchMultiReduceNestedTestImpl<EXEC_POL_DATA, REDUCE_POLICY, ABSTRACTION,
+                                    DATA_TYPE, IDX_TYPE>(s3, container,
+                                                         working_res, rngen);
   }
 }
 
diff --git a/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp b/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp
index a730d030a7..bb64d5424b 100644
--- a/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp
+++ b/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp
@@ -10,19 +10,25 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY,
-          typename THREAD_X_POLICY, typename THREAD_Y_POLICY, typename THREAD_Z_POLICY,
-          typename TEAM_X_POLICY, typename TEAM_Y_POLICY, typename TEAM_Z_POLICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename THREAD_X_POLICY,
+          typename THREAD_Y_POLICY,
+          typename THREAD_Z_POLICY,
+          typename TEAM_X_POLICY,
+          typename TEAM_Y_POLICY,
+          typename TEAM_Z_POLICY>
 void LaunchNestedDirectTestImpl(INDEX_TYPE M)
 {
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, 2*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r2(0, 3*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r3(0, 4*M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, 2 * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r2(0, 3 * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r3(0, 4 * M);
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r4(0, 4*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r5(0, 5*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r6(0, 6*M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r4(0, 4 * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r5(0, 5 * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r6(0, 6 * M);
 
   INDEX_TYPE N1 = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
   INDEX_TYPE N2 = static_cast<INDEX_TYPE>(r2.end() - r2.begin());
@@ -32,26 +38,22 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M)
   INDEX_TYPE N5 = static_cast<INDEX_TYPE>(r5.end() - r5.begin());
   INDEX_TYPE N6 = static_cast<INDEX_TYPE>(r6.end() - r6.begin());
 
-  INDEX_TYPE N = static_cast<INDEX_TYPE>(N1 * N2 *                                          
-                                         N3 * N4 *
-                                         N5 * N6);                                         
+  INDEX_TYPE N = static_cast<INDEX_TYPE>(N1 * N2 * N3 * N4 * N5 * N6);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
-  //6 threads total
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
+  // 6 threads total
   constexpr int threads_x = 2;
   constexpr int threads_y = 3;
   constexpr int threads_z = 4;
@@ -60,85 +62,132 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M)
   constexpr int blocks_y = 5;
   constexpr int blocks_z = 6;
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     std::iota(test_array, test_array + RAJA::stripIndexType(N), 0);
 
     constexpr int DIM = 6;
-    using layout_t = RAJA::Layout<DIM, INDEX_TYPE,DIM-1>;
-    RAJA::View<INDEX_TYPE, layout_t> Aview(working_array, N6, N5, N4, N3, N2, N1);
-
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<TEAM_Z_POLICY>(ctx, r6, [&](INDEX_TYPE bz) {
-            RAJA::loop<TEAM_Y_POLICY>(ctx, r5, [&](INDEX_TYPE by) {
-                RAJA::loop<TEAM_X_POLICY>(ctx, r4, [&](INDEX_TYPE bx) {
-
-                    RAJA::loop<THREAD_Z_POLICY>(ctx, r3, [&](INDEX_TYPE tz) {
-                        RAJA::loop<THREAD_Y_POLICY>(ctx, r2, [&](INDEX_TYPE ty) {
-                            RAJA::loop<THREAD_X_POLICY>(ctx, r1, [&](INDEX_TYPE tx) {
-
-                                auto idx = tx + N1 * (ty + N2 * (tz + N3 * (bx + N4 * (by + N5 * bz))));
-
-
-                                Aview(bz, by, bx, tz, ty, tx) = static_cast<INDEX_TYPE>(idx);
-                              });
+    using layout_t    = RAJA::Layout<DIM, INDEX_TYPE, DIM - 1>;
+    RAJA::View<INDEX_TYPE, layout_t> Aview(working_array, N6, N5, N4, N3, N2,
+                                           N1);
+
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z),
+                           RAJA::Threads(threads_x, threads_y, threads_z)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<TEAM_Z_POLICY>(
+              ctx, r6,
+              [&](INDEX_TYPE bz)
+              {
+                RAJA::loop<TEAM_Y_POLICY>(
+                    ctx, r5,
+                    [&](INDEX_TYPE by)
+                    {
+                      RAJA::loop<TEAM_X_POLICY>(
+                          ctx, r4,
+                          [&](INDEX_TYPE bx)
+                          {
+                            RAJA::loop<THREAD_Z_POLICY>(
+                                ctx, r3,
+                                [&](INDEX_TYPE tz)
+                                {
+                                  RAJA::loop<THREAD_Y_POLICY>(
+                                      ctx, r2,
+                                      [&](INDEX_TYPE ty)
+                                      {
+                                        RAJA::loop<THREAD_X_POLICY>(
+                                            ctx, r1,
+                                            [&](INDEX_TYPE tx)
+                                            {
+                                              auto idx =
+                                                  tx +
+                                                  N1 *
+                                                      (ty +
+                                                       N2 *
+                                                           (tz +
+                                                            N3 *
+                                                                (bx +
+                                                                 N4 *
+                                                                     (by +
+                                                                      N5 *
+                                                                          bz))));
+
+
+                                              Aview(bz, by, bx, tz, ty, tx) =
+                                                  static_cast<INDEX_TYPE>(idx);
+                                            });
+                                      });
+                                });
                           });
-                      });
-
-                  });
+                    });
               });
-          });
-    });
-  } else { // zero-length segment
+        });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(blocks_x, blocks_y ,blocks_z)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<TEAM_Z_POLICY>(ctx, r3, [&](INDEX_TYPE RAJA_UNUSED_ARG(bz)) {
-            RAJA::loop<TEAM_Y_POLICY>(ctx, r2, [&](INDEX_TYPE RAJA_UNUSED_ARG(by)) {
-                RAJA::loop<TEAM_X_POLICY>(ctx, r1, [&](INDEX_TYPE RAJA_UNUSED_ARG(bx)) {
-
-                    RAJA::loop<THREAD_Z_POLICY>(ctx, r3, [&](INDEX_TYPE RAJA_UNUSED_ARG(tz)) {
-                        RAJA::loop<THREAD_Y_POLICY>(ctx, r2, [&](INDEX_TYPE RAJA_UNUSED_ARG(ty)) {
-                            RAJA::loop<THREAD_X_POLICY>(ctx, r1, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx)) {
-
-                                working_array[0]++;
-                                
-                              });
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
+
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z),
+                           RAJA::Threads(blocks_x, blocks_y, blocks_z)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<TEAM_Z_POLICY>(
+              ctx, r3,
+              [&](INDEX_TYPE RAJA_UNUSED_ARG(bz))
+              {
+                RAJA::loop<TEAM_Y_POLICY>(
+                    ctx, r2,
+                    [&](INDEX_TYPE RAJA_UNUSED_ARG(by))
+                    {
+                      RAJA::loop<TEAM_X_POLICY>(
+                          ctx, r1,
+                          [&](INDEX_TYPE RAJA_UNUSED_ARG(bx))
+                          {
+                            RAJA::loop<THREAD_Z_POLICY>(
+                                ctx, r3,
+                                [&](INDEX_TYPE RAJA_UNUSED_ARG(tz))
+                                {
+                                  RAJA::loop<THREAD_Y_POLICY>(
+                                      ctx, r2,
+                                      [&](INDEX_TYPE RAJA_UNUSED_ARG(ty))
+                                      {
+                                        RAJA::loop<THREAD_X_POLICY>(
+                                            ctx, r1,
+                                            [&](INDEX_TYPE RAJA_UNUSED_ARG(tx))
+                                            { working_array[0]++; });
+                                      });
+                                });
                           });
-                      });
-
-                  });
+                    });
               });
-          });
-      });
+        });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
-    
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-      ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  if (RAJA::stripIndexType(N) > 0)
+  {
+
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+    {
+      ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+                check_array[RAJA::stripIndexType(i)]);
     }
-    
-  } else {
-    
+  }
+  else
+  {
+
     ASSERT_EQ(test_array[0], check_array[0]);
-    
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -146,8 +195,7 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M)
 TYPED_TEST_SUITE_P(LaunchNestedDirectTest);
 template <typename T>
 class LaunchNestedDirectTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(LaunchNestedDirectTest, RangeSegmentTeams)
@@ -155,34 +203,44 @@ TYPED_TEST_P(LaunchNestedDirectTest, RangeSegmentTeams)
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-
-  using TEAM_Z_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using TEAM_Y_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<2>>::type;
-  using TEAM_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<3>>::type;
-
-  using THREAD_Z_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<4>>::type;
-  using THREAD_Y_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<5>>::type;
-  using THREAD_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<6>>::type;
-
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+
+  using TEAM_Z_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using TEAM_Y_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<2>>::type;
+  using TEAM_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<3>>::type;
+
+  using THREAD_Z_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<4>>::type;
+  using THREAD_Y_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<5>>::type;
+  using THREAD_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<6>>::type;
 
 
   // test zero-length range segment
   LaunchNestedDirectTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                           THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
-                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>
-    (INDEX_TYPE(0));
+                             THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
+                             TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>(
+      INDEX_TYPE(0));
 
-  //Keep at one since we are doing a direct thread test
+  // Keep at one since we are doing a direct thread test
   LaunchNestedDirectTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                           THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
-                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>
-    (INDEX_TYPE(1));
-
-
+                             THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
+                             TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>(
+      INDEX_TYPE(1));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchNestedDirectTest,
-                            RangeSegmentTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchNestedDirectTest, RangeSegmentTeams);
 
 #endif  // __TEST_LAUNCH_NESTED_DIRECT_HPP__
diff --git a/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp b/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp
index 8f3b9702d0..c9192b6718 100644
--- a/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp
+++ b/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp
@@ -10,19 +10,25 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY,
-          typename THREAD_X_POLICY, typename THREAD_Y_POLICY, typename THREAD_Z_POLICY,
-          typename TEAM_X_POLICY, typename TEAM_Y_POLICY, typename TEAM_Z_POLICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename THREAD_X_POLICY,
+          typename THREAD_Y_POLICY,
+          typename THREAD_Z_POLICY,
+          typename TEAM_X_POLICY,
+          typename TEAM_Y_POLICY,
+          typename TEAM_Z_POLICY>
 void LaunchNestedLoopTestImpl(INDEX_TYPE M)
 {
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, 2*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r2(0, 3*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r3(0, 4*M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, 2 * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r2(0, 3 * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r3(0, 4 * M);
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r4(0, 8*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r5(0, 2*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r6(0, 3*M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r4(0, 8 * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r5(0, 2 * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r6(0, 3 * M);
 
   INDEX_TYPE N1 = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
   INDEX_TYPE N2 = static_cast<INDEX_TYPE>(r2.end() - r2.begin());
@@ -33,30 +39,23 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M)
   INDEX_TYPE N6 = static_cast<INDEX_TYPE>(r6.end() - r6.begin());
 
 
-  INDEX_TYPE N = static_cast<INDEX_TYPE>(N1 *
-                                         N2 *
-                                         N3 *
-                                         N4 *
-                                         N5 *
-                                         N6);
+  INDEX_TYPE N = static_cast<INDEX_TYPE>(N1 * N2 * N3 * N4 * N5 * N6);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  //6 threads total
+  // 6 threads total
   constexpr int threads_x = 1;
   constexpr int threads_y = 2;
   constexpr int threads_z = 3;
@@ -65,86 +64,132 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M)
   constexpr int blocks_y = 2;
   constexpr int blocks_z = 1;
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     std::iota(test_array, test_array + RAJA::stripIndexType(N), 0);
 
     constexpr int DIM = 6;
-    using layout_t = RAJA::Layout<DIM, INDEX_TYPE,DIM-1>;
-    RAJA::View<INDEX_TYPE, layout_t> Aview(working_array, N6, N5, N4, N3, N2, N1);
-
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<TEAM_Z_POLICY>(ctx, r6, [&](INDEX_TYPE bz) {
-            RAJA::loop<TEAM_Y_POLICY>(ctx, r5, [&](INDEX_TYPE by) {
-                RAJA::loop<TEAM_X_POLICY>(ctx, r4, [&](INDEX_TYPE bx) {
-
-                    RAJA::loop<THREAD_Z_POLICY>(ctx, r3, [&](INDEX_TYPE tz) {
-                        RAJA::loop<THREAD_Y_POLICY>(ctx, r2, [&](INDEX_TYPE ty) {
-                            RAJA::loop<THREAD_X_POLICY>(ctx, r1, [&](INDEX_TYPE tx) {
-
-                                auto idx = tx + N1 * (ty + N2 * (tz + N3 * (bx + N4 * (by + N5 * bz))));
-
-
-                                Aview(bz, by, bx, tz, ty, tx) = static_cast<INDEX_TYPE>(idx);
-                                
-                              });
+    using layout_t    = RAJA::Layout<DIM, INDEX_TYPE, DIM - 1>;
+    RAJA::View<INDEX_TYPE, layout_t> Aview(working_array, N6, N5, N4, N3, N2,
+                                           N1);
+
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z),
+                           RAJA::Threads(threads_x, threads_y, threads_z)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<TEAM_Z_POLICY>(
+              ctx, r6,
+              [&](INDEX_TYPE bz)
+              {
+                RAJA::loop<TEAM_Y_POLICY>(
+                    ctx, r5,
+                    [&](INDEX_TYPE by)
+                    {
+                      RAJA::loop<TEAM_X_POLICY>(
+                          ctx, r4,
+                          [&](INDEX_TYPE bx)
+                          {
+                            RAJA::loop<THREAD_Z_POLICY>(
+                                ctx, r3,
+                                [&](INDEX_TYPE tz)
+                                {
+                                  RAJA::loop<THREAD_Y_POLICY>(
+                                      ctx, r2,
+                                      [&](INDEX_TYPE ty)
+                                      {
+                                        RAJA::loop<THREAD_X_POLICY>(
+                                            ctx, r1,
+                                            [&](INDEX_TYPE tx)
+                                            {
+                                              auto idx =
+                                                  tx +
+                                                  N1 *
+                                                      (ty +
+                                                       N2 *
+                                                           (tz +
+                                                            N3 *
+                                                                (bx +
+                                                                 N4 *
+                                                                     (by +
+                                                                      N5 *
+                                                                          bz))));
+
+
+                                              Aview(bz, by, bx, tz, ty, tx) =
+                                                  static_cast<INDEX_TYPE>(idx);
+                                            });
+                                      });
+                                });
                           });
-                      });
-
-                  });
+                    });
               });
-          });
-    });
-  } else { // zero-length segment
+        });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(blocks_x, blocks_y ,blocks_z)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<TEAM_Z_POLICY>(ctx, r3, [&](INDEX_TYPE RAJA_UNUSED_ARG(bz)) {
-            RAJA::loop<TEAM_Y_POLICY>(ctx, r2, [&](INDEX_TYPE RAJA_UNUSED_ARG(by)) {
-                RAJA::loop<TEAM_X_POLICY>(ctx, r1, [&](INDEX_TYPE RAJA_UNUSED_ARG(bx)) {
-
-                    RAJA::loop<THREAD_Z_POLICY>(ctx, r3, [&](INDEX_TYPE RAJA_UNUSED_ARG(tz)) {
-                        RAJA::loop<THREAD_Y_POLICY>(ctx, r2, [&](INDEX_TYPE RAJA_UNUSED_ARG(ty)) {
-                            RAJA::loop<THREAD_X_POLICY>(ctx, r1, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx) ) {
-
-                                working_array[0]++;
-                                
-                              });
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
+
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z),
+                           RAJA::Threads(blocks_x, blocks_y, blocks_z)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<TEAM_Z_POLICY>(
+              ctx, r3,
+              [&](INDEX_TYPE RAJA_UNUSED_ARG(bz))
+              {
+                RAJA::loop<TEAM_Y_POLICY>(
+                    ctx, r2,
+                    [&](INDEX_TYPE RAJA_UNUSED_ARG(by))
+                    {
+                      RAJA::loop<TEAM_X_POLICY>(
+                          ctx, r1,
+                          [&](INDEX_TYPE RAJA_UNUSED_ARG(bx))
+                          {
+                            RAJA::loop<THREAD_Z_POLICY>(
+                                ctx, r3,
+                                [&](INDEX_TYPE RAJA_UNUSED_ARG(tz))
+                                {
+                                  RAJA::loop<THREAD_Y_POLICY>(
+                                      ctx, r2,
+                                      [&](INDEX_TYPE RAJA_UNUSED_ARG(ty))
+                                      {
+                                        RAJA::loop<THREAD_X_POLICY>(
+                                            ctx, r1,
+                                            [&](INDEX_TYPE RAJA_UNUSED_ARG(tx))
+                                            { working_array[0]++; });
+                                      });
+                                });
                           });
-                      });
-
-                  });
+                    });
               });
-          });
-      });
+        });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-      ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+    {
+      ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+                check_array[RAJA::stripIndexType(i)]);
     }
-
-  } else {
+  }
+  else
+  {
 
     ASSERT_EQ(test_array[0], check_array[0]);
-
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -152,8 +197,7 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M)
 TYPED_TEST_SUITE_P(LaunchNestedLoopTest);
 template <typename T>
 class LaunchNestedLoopTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(LaunchNestedLoopTest, RangeSegmentTeams)
@@ -161,32 +205,43 @@ TYPED_TEST_P(LaunchNestedLoopTest, RangeSegmentTeams)
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-
-  using TEAM_Z_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using TEAM_Y_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<2>>::type;
-  using TEAM_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<3>>::type;
-
-  using THREAD_Z_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<4>>::type;
-  using THREAD_Y_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<5>>::type;
-  using THREAD_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<6>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+
+  using TEAM_Z_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using TEAM_Y_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<2>>::type;
+  using TEAM_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<3>>::type;
+
+  using THREAD_Z_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<4>>::type;
+  using THREAD_Y_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<5>>::type;
+  using THREAD_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<6>>::type;
 
 
   // test zero-length range segment
   LaunchNestedLoopTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
                            THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
-                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>
-    (INDEX_TYPE(0));
+                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>(
+      INDEX_TYPE(0));
 
   LaunchNestedLoopTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
                            THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
-                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>
-    (INDEX_TYPE(3));
-
-
+                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>(
+      INDEX_TYPE(3));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchNestedLoopTest,
-                            RangeSegmentTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchNestedLoopTest, RangeSegmentTeams);
 
 #endif  // __TEST_LAUNCH_NESTED_LOOP_HPP__
diff --git a/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp b/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp
index 793d432987..20a4e10ac6 100644
--- a/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp
+++ b/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp
@@ -10,9 +10,15 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY,
-          typename THREAD_X_POLICY, typename THREAD_Y_POLICY, typename THREAD_Z_POLICY,
-          typename TEAM_X_POLICY, typename TEAM_Y_POLICY, typename TEAM_Z_POLICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename THREAD_X_POLICY,
+          typename THREAD_Y_POLICY,
+          typename THREAD_Z_POLICY,
+          typename TEAM_X_POLICY,
+          typename TEAM_Y_POLICY,
+          typename TEAM_Z_POLICY>
 void LaunchNestedTileDirectTestImpl(INDEX_TYPE M)
 {
 
@@ -20,121 +26,152 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M)
   constexpr int tile_size_y = 3;
   constexpr int tile_size_z = 4;
 
-  constexpr int threads_x = 2*tile_size_x;
-  constexpr int threads_y = 3*tile_size_y;
-  constexpr int threads_z = 4*tile_size_z;
+  constexpr int threads_x = 2 * tile_size_x;
+  constexpr int threads_y = 3 * tile_size_y;
+  constexpr int threads_z = 4 * tile_size_z;
 
   constexpr int blocks_x = 4;
   constexpr int blocks_y = 5;
   constexpr int blocks_z = 6;
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, tile_size_x*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r2(0, tile_size_y*M);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r3(0, tile_size_z*M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, tile_size_x * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r2(0, tile_size_y * M);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r3(0, tile_size_z * M);
 
   INDEX_TYPE N1 = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
   INDEX_TYPE N2 = static_cast<INDEX_TYPE>(r2.end() - r2.begin());
   INDEX_TYPE N3 = static_cast<INDEX_TYPE>(r3.end() - r3.begin());
 
-  INDEX_TYPE N = static_cast<INDEX_TYPE>(N1 *
-                                         N2 *
-                                         N3);
+  INDEX_TYPE N = static_cast<INDEX_TYPE>(N1 * N2 * N3);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     std::iota(test_array, test_array + RAJA::stripIndexType(N), 0);
 
     constexpr int DIM = 3;
-    using layout_t = RAJA::Layout<DIM, INDEX_TYPE,DIM-1>;
+    using layout_t    = RAJA::Layout<DIM, INDEX_TYPE, DIM - 1>;
     RAJA::View<INDEX_TYPE, layout_t> Aview(working_array, N3, N2, N1);
 
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::tile<TEAM_Z_POLICY>(ctx, tile_size_z, r3, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &z_tile) {
-            RAJA::tile<TEAM_Y_POLICY>(ctx, tile_size_y, r2, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &y_tile) {
-                RAJA::tile<TEAM_X_POLICY>(ctx, tile_size_x, r1, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &x_tile) {
-
-                    RAJA::loop<THREAD_Z_POLICY>(ctx, z_tile, [&](INDEX_TYPE tz) {
-                        RAJA::loop<THREAD_Y_POLICY>(ctx, y_tile, [&](INDEX_TYPE ty) {
-                            RAJA::loop<THREAD_X_POLICY>(ctx, x_tile, [&](INDEX_TYPE tx) {
-
-                                auto idx = tx + N1 * (ty + N2 * tz);
-
-                                Aview(tz, ty, tx) = static_cast<INDEX_TYPE>(idx);
-
-                              });
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z),
+                           RAJA::Threads(threads_x, threads_y, threads_z)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::tile<TEAM_Z_POLICY>(
+              ctx, tile_size_z, r3,
+              [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& z_tile)
+              {
+                RAJA::tile<TEAM_Y_POLICY>(
+                    ctx, tile_size_y, r2,
+                    [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& y_tile)
+                    {
+                      RAJA::tile<TEAM_X_POLICY>(
+                          ctx, tile_size_x, r1,
+                          [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& x_tile)
+                          {
+                            RAJA::loop<THREAD_Z_POLICY>(
+                                ctx, z_tile,
+                                [&](INDEX_TYPE tz)
+                                {
+                                  RAJA::loop<THREAD_Y_POLICY>(
+                                      ctx, y_tile,
+                                      [&](INDEX_TYPE ty)
+                                      {
+                                        RAJA::loop<THREAD_X_POLICY>(
+                                            ctx, x_tile,
+                                            [&](INDEX_TYPE tx)
+                                            {
+                                              auto idx =
+                                                  tx + N1 * (ty + N2 * tz);
+
+                                              Aview(tz, ty, tx) =
+                                                  static_cast<INDEX_TYPE>(idx);
+                                            });
+                                      });
+                                });
                           });
-                      });
-
-                  });
+                    });
               });
-          });
-    });
-  } else { // zero-length segment
+        });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(blocks_x, blocks_y ,blocks_z)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::tile<TEAM_Z_POLICY>(ctx, threads_z, r3, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &z_tile) {
-            RAJA::tile<TEAM_Y_POLICY>(ctx, threads_y, r2, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &y_tile) {
-                RAJA::tile<TEAM_X_POLICY>(ctx, threads_x, r1, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &x_tile) {
-
-                    RAJA::loop<THREAD_Z_POLICY>(ctx, z_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(tz)) {
-                        RAJA::loop<THREAD_Y_POLICY>(ctx, y_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(ty)) {
-                            RAJA::loop<THREAD_X_POLICY>(ctx, x_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx)) {
-
-                                working_array[0]++;
-                                
-                              });
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
+
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z),
+                           RAJA::Threads(blocks_x, blocks_y, blocks_z)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::tile<TEAM_Z_POLICY>(
+              ctx, threads_z, r3,
+              [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& z_tile)
+              {
+                RAJA::tile<TEAM_Y_POLICY>(
+                    ctx, threads_y, r2,
+                    [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& y_tile)
+                    {
+                      RAJA::tile<TEAM_X_POLICY>(
+                          ctx, threads_x, r1,
+                          [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& x_tile)
+                          {
+                            RAJA::loop<THREAD_Z_POLICY>(
+                                ctx, z_tile,
+                                [&](INDEX_TYPE RAJA_UNUSED_ARG(tz))
+                                {
+                                  RAJA::loop<THREAD_Y_POLICY>(
+                                      ctx, y_tile,
+                                      [&](INDEX_TYPE RAJA_UNUSED_ARG(ty))
+                                      {
+                                        RAJA::loop<THREAD_X_POLICY>(
+                                            ctx, x_tile,
+                                            [&](INDEX_TYPE RAJA_UNUSED_ARG(tx))
+                                            { working_array[0]++; });
+                                      });
+                                });
                           });
-                      });
-
-                  });
+                    });
               });
-          });
-      });
+        });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-      ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+    {
+      ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+                check_array[RAJA::stripIndexType(i)]);
     }
-
-  } else {
+  }
+  else
+  {
 
     ASSERT_EQ(test_array[0], check_array[0]);
-
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -142,8 +179,7 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M)
 TYPED_TEST_SUITE_P(LaunchNestedTileDirectTest);
 template <typename T>
 class LaunchNestedTileDirectTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(LaunchNestedTileDirectTest, RangeSegmentTeams)
@@ -151,33 +187,44 @@ TYPED_TEST_P(LaunchNestedTileDirectTest, RangeSegmentTeams)
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-
-  using THREAD_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using THREAD_Y_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<2>>::type;
-  using THREAD_Z_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<3>>::type;
-
-  using TEAM_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<4>>::type;
-  using TEAM_Y_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<5>>::type;
-  using TEAM_Z_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<6>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+
+  using THREAD_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using THREAD_Y_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<2>>::type;
+  using THREAD_Z_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<3>>::type;
+
+  using TEAM_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<4>>::type;
+  using TEAM_Y_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<5>>::type;
+  using TEAM_Z_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<6>>::type;
 
 
   // test zero-length range segment
-  LaunchNestedTileDirectTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                           THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
-                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>
-    (INDEX_TYPE(0));
-
-  //Keep at one since we are doing a direct thread test
-  LaunchNestedTileDirectTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                           THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
-                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>
-    (INDEX_TYPE(1));
-
-
+  LaunchNestedTileDirectTestImpl<
+      INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, THREAD_X_POLICY, THREAD_Y_POLICY,
+      THREAD_Z_POLICY, TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>(
+      INDEX_TYPE(0));
+
+  // Keep at one since we are doing a direct thread test
+  LaunchNestedTileDirectTestImpl<
+      INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, THREAD_X_POLICY, THREAD_Y_POLICY,
+      THREAD_Z_POLICY, TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>(
+      INDEX_TYPE(1));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileDirectTest,
-                            RangeSegmentTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileDirectTest, RangeSegmentTeams);
 
 #endif  // __TEST_LAUNCH_NESTED_TILE_DIRECT_HPP__
diff --git a/test/functional/launch/nested_tile_loop/tests/test-launch-nested-Tile-Loop.hpp b/test/functional/launch/nested_tile_loop/tests/test-launch-nested-Tile-Loop.hpp
index 07deab0376..790498dc2f 100644
--- a/test/functional/launch/nested_tile_loop/tests/test-launch-nested-Tile-Loop.hpp
+++ b/test/functional/launch/nested_tile_loop/tests/test-launch-nested-Tile-Loop.hpp
@@ -10,9 +10,15 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY,
-          typename THREAD_X_POLICY, typename THREAD_Y_POLICY, typename THREAD_Z_POLICY,
-          typename TEAM_X_POLICY, typename TEAM_Y_POLICY, typename TEAM_Z_POLICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename THREAD_X_POLICY,
+          typename THREAD_Y_POLICY,
+          typename THREAD_Z_POLICY,
+          typename TEAM_X_POLICY,
+          typename TEAM_Y_POLICY,
+          typename TEAM_Z_POLICY>
 void LaunchNestedTileLoopTestImpl(INDEX_TYPE M)
 {
 
@@ -24,116 +30,151 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M)
   constexpr int blocks_y = 5;
   constexpr int blocks_z = 6;
 
-  //Add one to we check the bounds checking capability
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, threads_x*M + 1);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r2(0, threads_y*M + 1);
-  RAJA::TypedRangeSegment<INDEX_TYPE> r3(0, threads_z*M + 1);
+  // Add one so we check the bounds checking capability
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, threads_x * M + 1);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r2(0, threads_y * M + 1);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r3(0, threads_z * M + 1);
 
   INDEX_TYPE N1 = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
   INDEX_TYPE N2 = static_cast<INDEX_TYPE>(r2.end() - r2.begin());
   INDEX_TYPE N3 = static_cast<INDEX_TYPE>(r3.end() - r3.begin());
 
-  INDEX_TYPE N = static_cast<INDEX_TYPE>(N1 *
-                                         N2 *
-                                         N3);
+  INDEX_TYPE N = static_cast<INDEX_TYPE>(N1 * N2 * N3);
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     std::iota(test_array, test_array + RAJA::stripIndexType(N), 0);
 
     constexpr int DIM = 3;
-    using layout_t = RAJA::Layout<DIM, INDEX_TYPE,DIM-1>;
+    using layout_t    = RAJA::Layout<DIM, INDEX_TYPE, DIM - 1>;
     RAJA::View<INDEX_TYPE, layout_t> Aview(working_array, N3, N2, N1);
 
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(threads_x, threads_y,threads_z)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::tile<TEAM_Z_POLICY>(ctx, threads_z, r3, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &z_tile) {
-            RAJA::tile<TEAM_Y_POLICY>(ctx, threads_y, r2, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &y_tile) {
-                RAJA::tile<TEAM_X_POLICY>(ctx, threads_x, r1, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &x_tile) {
-
-                    RAJA::loop<THREAD_Z_POLICY>(ctx, z_tile, [&](INDEX_TYPE tz) {
-                        RAJA::loop<THREAD_Y_POLICY>(ctx, y_tile, [&](INDEX_TYPE ty) {
-                            RAJA::loop<THREAD_X_POLICY>(ctx, x_tile, [&](INDEX_TYPE tx) {
-
-                                auto idx = tx + N1 * (ty + N2 * tz);
-
-                                Aview(tz, ty, tx) = static_cast<INDEX_TYPE>(idx);
-                              });
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z),
+                           RAJA::Threads(threads_x, threads_y, threads_z)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::tile<TEAM_Z_POLICY>(
+              ctx, threads_z, r3,
+              [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& z_tile)
+              {
+                RAJA::tile<TEAM_Y_POLICY>(
+                    ctx, threads_y, r2,
+                    [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& y_tile)
+                    {
+                      RAJA::tile<TEAM_X_POLICY>(
+                          ctx, threads_x, r1,
+                          [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& x_tile)
+                          {
+                            RAJA::loop<THREAD_Z_POLICY>(
+                                ctx, z_tile,
+                                [&](INDEX_TYPE tz)
+                                {
+                                  RAJA::loop<THREAD_Y_POLICY>(
+                                      ctx, y_tile,
+                                      [&](INDEX_TYPE ty)
+                                      {
+                                        RAJA::loop<THREAD_X_POLICY>(
+                                            ctx, x_tile,
+                                            [&](INDEX_TYPE tx)
+                                            {
+                                              auto idx =
+                                                  tx + N1 * (ty + N2 * tz);
+
+                                              Aview(tz, ty, tx) =
+                                                  static_cast<INDEX_TYPE>(idx);
+                                            });
+                                      });
+                                });
                           });
-                      });
-
-                  });
+                    });
               });
-          });
-    });
-  } else { // zero-length segment
+        });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z), RAJA::Threads(blocks_x, blocks_y ,blocks_z)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::tile<TEAM_Z_POLICY>(ctx, threads_z, r3, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &z_tile) {
-            RAJA::tile<TEAM_Y_POLICY>(ctx, threads_y, r2, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &y_tile) {
-                RAJA::tile<TEAM_X_POLICY>(ctx, threads_x, r1, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &x_tile) {
-
-                    RAJA::loop<THREAD_Z_POLICY>(ctx, z_tile, [&](INDEX_TYPE tz) {
-                        RAJA::loop<THREAD_Y_POLICY>(ctx, y_tile, [&](INDEX_TYPE ty) {
-                            RAJA::loop<THREAD_X_POLICY>(ctx, x_tile, [&](INDEX_TYPE tx) {
-
-                                (void) tx;
-                                (void) ty;
-                                (void) tz;
-
-                                working_array[0]++;
-                              });
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
+
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks_x, blocks_y, blocks_z),
+                           RAJA::Threads(blocks_x, blocks_y, blocks_z)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::tile<TEAM_Z_POLICY>(
+              ctx, threads_z, r3,
+              [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& z_tile)
+              {
+                RAJA::tile<TEAM_Y_POLICY>(
+                    ctx, threads_y, r2,
+                    [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& y_tile)
+                    {
+                      RAJA::tile<TEAM_X_POLICY>(
+                          ctx, threads_x, r1,
+                          [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& x_tile)
+                          {
+                            RAJA::loop<THREAD_Z_POLICY>(
+                                ctx, z_tile,
+                                [&](INDEX_TYPE tz)
+                                {
+                                  RAJA::loop<THREAD_Y_POLICY>(
+                                      ctx, y_tile,
+                                      [&](INDEX_TYPE ty)
+                                      {
+                                        RAJA::loop<THREAD_X_POLICY>(
+                                            ctx, x_tile,
+                                            [&](INDEX_TYPE tx)
+                                            {
+                                              (void)tx;
+                                              (void)ty;
+                                              (void)tz;
+
+                                              working_array[0]++;
+                                            });
+                                      });
+                                });
                           });
-                      });
-
-                  });
+                    });
               });
-          });
-      });
+        });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-      ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+    {
+      ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+                check_array[RAJA::stripIndexType(i)]);
     }
-
-  } else {
+  }
+  else
+  {
 
     ASSERT_EQ(test_array[0], check_array[0]);
-
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -141,8 +182,7 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M)
 TYPED_TEST_SUITE_P(LaunchNestedTileLoopTest);
 template <typename T>
 class LaunchNestedTileLoopTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(LaunchNestedTileLoopTest, RangeSegmentTeams)
@@ -150,33 +190,44 @@ TYPED_TEST_P(LaunchNestedTileLoopTest, RangeSegmentTeams)
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-
-  using THREAD_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using THREAD_Y_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<2>>::type;
-  using THREAD_Z_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<3>>::type;
-
-  using TEAM_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<4>>::type;
-  using TEAM_Y_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<5>>::type;
-  using TEAM_Z_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<6>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+
+  using THREAD_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using THREAD_Y_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<2>>::type;
+  using THREAD_Z_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<3>>::type;
+
+  using TEAM_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<4>>::type;
+  using TEAM_Y_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<5>>::type;
+  using TEAM_Z_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<6>>::type;
 
 
   // test zero-length range segment
-  LaunchNestedTileLoopTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                           THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
-                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>
-    (INDEX_TYPE(0));
-
-  //Keep at one since we are doing a direct thread test
-  LaunchNestedTileLoopTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                           THREAD_X_POLICY, THREAD_Y_POLICY, THREAD_Z_POLICY,
-                           TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>
-    (INDEX_TYPE(1));
-
-
+  LaunchNestedTileLoopTestImpl<
+      INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, THREAD_X_POLICY, THREAD_Y_POLICY,
+      THREAD_Z_POLICY, TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>(
+      INDEX_TYPE(0));
+
+  // Keep at one since we are doing a direct thread test
+  LaunchNestedTileLoopTestImpl<
+      INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, THREAD_X_POLICY, THREAD_Y_POLICY,
+      THREAD_Z_POLICY, TEAM_X_POLICY, TEAM_Y_POLICY, TEAM_Z_POLICY>(
+      INDEX_TYPE(1));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileLoopTest,
-                            RangeSegmentTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileLoopTest, RangeSegmentTeams);
 
 #endif  // __TEST_LAUNCH_NESTED_TILE_LOOP_HPP__
diff --git a/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceBitAnd.hpp b/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceBitAnd.hpp
index aed4b9618e..c07e8490ea 100644
--- a/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceBitAnd.hpp
+++ b/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceBitAnd.hpp
@@ -13,7 +13,8 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
           typename LAUNCH_POLICY,
           typename GLOBAL_THREAD_POLICY,
@@ -23,39 +24,36 @@ void LaunchReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
                                      camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
   constexpr int threads = 256;
-  int blocks = (seg.size() - 1)/threads + 1;
+  int blocks            = (seg.size() - 1) / threads + 1;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   //
   // First a simple non-trivial test that is mildly interesting
   //
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
     test_array[i] = 13;
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   RAJA::ReduceBitAnd<REDUCE_POLICY, DATA_TYPE> simpand(21);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-
-          simpand &= working_array[idx];
-     });
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(
+            ctx, seg, [&](IDX_TYPE idx) { simpand &= working_array[idx]; });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(simpand.get()), 5);
 
@@ -66,27 +64,32 @@ void LaunchReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE ref_and = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_and &= test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_and &= test_array[seg_idx[i]];
   }
 
   RAJA::ReduceBitAnd<REDUCE_POLICY, DATA_TYPE> redand(0);
   RAJA::ReduceBitAnd<REDUCE_POLICY, DATA_TYPE> redand2(2);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-        redand  &= working_array[idx];
-        redand2 &= working_array[idx];
-    });
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg,
+                                         [&](IDX_TYPE idx)
+                                         {
+                                           redand &= working_array[idx];
+                                           redand2 &= working_array[idx];
+                                         });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redand.get()), ref_and);
   ASSERT_EQ(static_cast<DATA_TYPE>(redand2.get()), ref_and);
@@ -94,22 +97,21 @@ void LaunchReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
   redand.reset(0);
 
   const int nloops = 3;
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-       [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-          redand &= working_array[idx];
-      });
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<GLOBAL_THREAD_POLICY>(
+              ctx, seg, [&](IDX_TYPE idx) { redand &= working_array[idx]; });
+        });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redand.get()), ref_and);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -117,79 +119,79 @@ void LaunchReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(LaunchReduceBitAndBasicTest);
 template <typename T>
 class LaunchReduceBitAndBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchReduceBitAndBasicTest, ReduceBitAndBasicForall)
 {
-  using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<1>>::type;
+  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<1>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
-  LaunchReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                    r1, seg_idx, working_res);
+  LaunchReduceBitAndBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+      GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
-  LaunchReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                    r2, seg_idx, working_res);
+  LaunchReduceBitAndBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+      GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
-  LaunchReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                    r3, seg_idx, working_res);
+  LaunchReduceBitAndBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+      GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
-  LaunchReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                    r4, seg_idx, working_res);
+  LaunchReduceBitAndBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+      LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                          working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
-  LaunchReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                    r5, seg_idx, working_res);
+  LaunchReduceBitAndBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+      LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                          working_res);
 
   // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
-  LaunchReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                    l1, seg_idx, working_res);
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
+  LaunchReduceBitAndBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedListSegment<IDX_TYPE>, LAUNCH_POLICY,
+      GLOBAL_THREAD_POLICY, REDUCE_POLICY>(l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(LaunchReduceBitAndBasicTest,
diff --git a/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceMin.hpp b/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceMin.hpp
index 3e8c86ffd8..eb3f55c1e5 100644
--- a/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceMin.hpp
+++ b/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceMin.hpp
@@ -13,7 +13,8 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
           typename LAUNCH_POLICY,
           typename GLOBAL_THREAD_POLICY,
@@ -23,32 +24,31 @@ void LaunchReduceMinBasicTestImpl(const SEG_TYPE& seg,
                                   camp::resources::Resource working_res)
 {
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
   constexpr int threads = 256;
-  int blocks = (seg.size() - 1)/threads + 1;
+  int blocks            = (seg.size() - 1) / threads + 1;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
-  const DATA_TYPE min_init = modval+1;
+  const int modval          = 100;
+  const DATA_TYPE min_init  = modval + 1;
   const DATA_TYPE small_min = -modval;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_min = min_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_min = RAJA_MIN(test_array[ seg_idx[i] ], ref_min);
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_min = RAJA_MIN(test_array[seg_idx[i]], ref_min);
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -57,15 +57,17 @@ void LaunchReduceMinBasicTestImpl(const SEG_TYPE& seg,
   RAJA::ReduceMin<REDUCE_POLICY, DATA_TYPE> mininit(small_min);
   RAJA::ReduceMin<REDUCE_POLICY, DATA_TYPE> min(min_init);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-
-          mininit.min( working_array[idx] );
-          min.min( working_array[idx] );
-    });
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg,
+                                         [&](IDX_TYPE idx)
+                                         {
+                                           mininit.min(working_array[idx]);
+                                           min.min(working_array[idx]);
+                                         });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(mininit.get()), small_min);
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), ref_min);
@@ -74,33 +76,31 @@ void LaunchReduceMinBasicTestImpl(const SEG_TYPE& seg,
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), min_init);
 
   DATA_TYPE factor = 3;
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-
-          min.min( working_array[idx] * factor);
-    });
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(
+            ctx, seg,
+            [&](IDX_TYPE idx) { min.min(working_array[idx] * factor); });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), ref_min * factor);
 
   factor = 2;
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-
-          min.min( working_array[idx] * factor);
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(
+            ctx, seg,
+            [&](IDX_TYPE idx) { min.min(working_array[idx] * factor); });
       });
-  });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(min.get()), ref_min * factor);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -108,82 +108,85 @@ void LaunchReduceMinBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(LaunchReduceMinBasicTest);
 template <typename T>
 class LaunchReduceMinBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchReduceMinBasicTest, ReduceMinBasicForall)
 {
-  using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<1>>::type;
+  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<1>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   LaunchReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r1, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+                               GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
+      r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   LaunchReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r2, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+                               GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
+      r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   LaunchReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r3, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+                               GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
+      r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
-  LaunchReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r4, seg_idx, working_res);
+  LaunchReduceMinBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+      LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                          working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
-  LaunchReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r5, seg_idx, working_res);
+  LaunchReduceMinBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+      LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                          working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   LaunchReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 l1, seg_idx, working_res);
+                               RAJA::TypedListSegment<IDX_TYPE>, LAUNCH_POLICY,
+                               GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
+      l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchReduceMinBasicTest,
-                            ReduceMinBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(LaunchReduceMinBasicTest, ReduceMinBasicForall);
 
 #endif  // __TEST_LAUNCH_BASIC_REDUCEMIN_HPP__
diff --git a/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceSum.hpp b/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceSum.hpp
index 798988f116..da783f96bd 100644
--- a/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceSum.hpp
+++ b/test/functional/launch/reduce-basic/tests/test-launch-basic-ReduceSum.hpp
@@ -13,7 +13,8 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
           typename LAUNCH_POLICY,
           typename GLOBAL_THREAD_POLICY,
@@ -24,30 +25,29 @@ void LaunchReduceSumBasicTestImpl(const SEG_TYPE& seg,
 {
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
   constexpr int threads = 256;
-  int blocks = (seg.size() - 1)/threads + 1;
+  int blocks            = (seg.size() - 1) / threads + 1;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_sum = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_sum += test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_sum += test_array[seg_idx[i]];
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -56,14 +56,17 @@ void LaunchReduceSumBasicTestImpl(const SEG_TYPE& seg,
   RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> sum(0);
   RAJA::ReduceSum<REDUCE_POLICY, DATA_TYPE> sum2(2);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-          sum  += working_array[idx];
-          sum2 += working_array[idx];
-     });
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg,
+                                         [&](IDX_TYPE idx)
+                                         {
+                                           sum += working_array[idx];
+                                           sum2 += working_array[idx];
+                                         });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(sum.get()), ref_sum);
   ASSERT_EQ(static_cast<DATA_TYPE>(sum2.get()), ref_sum + 2);
@@ -72,23 +75,21 @@ void LaunchReduceSumBasicTestImpl(const SEG_TYPE& seg,
 
   const int nloops = 2;
 
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-            sum += working_array[idx];
-          });
-      });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<GLOBAL_THREAD_POLICY>(
+              ctx, seg, [&](IDX_TYPE idx) { sum += working_array[idx]; });
+        });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(sum.get()), nloops * ref_sum);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -96,81 +97,84 @@ void LaunchReduceSumBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(LaunchReduceSumBasicTest);
 template <typename T>
 class LaunchReduceSumBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchReduceSumBasicTest, ReduceSumBasicForall)
 {
-  using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<1>>::type;
+  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<1>>::type;
   using REDUCE_POLICY = typename camp::at<TypeParam, camp::num<4>>::type;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   LaunchReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r1, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+                               GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
+      r1, seg_idx, working_res);
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   LaunchReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r2, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+                               GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
+      r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   LaunchReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r3, seg_idx, working_res);
+                               RAJA::TypedRangeSegment<IDX_TYPE>, LAUNCH_POLICY,
+                               GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
+      r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
-  LaunchReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r4, seg_idx, working_res);
+  LaunchReduceSumBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+      LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r4, seg_idx,
+                                                          working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
-  LaunchReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 r5, seg_idx, working_res);
+  LaunchReduceSumBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+      LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(r5, seg_idx,
+                                                          working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   LaunchReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
-                                 l1, seg_idx, working_res);
+                               RAJA::TypedListSegment<IDX_TYPE>, LAUNCH_POLICY,
+                               GLOBAL_THREAD_POLICY, REDUCE_POLICY>(
+      l1, seg_idx, working_res);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchReduceSumBasicTest,
-                            ReduceSumBasicForall);
+REGISTER_TYPED_TEST_SUITE_P(LaunchReduceSumBasicTest, ReduceSumBasicForall);
 
 #endif  // __TEST_LAUNCH_BASIC_REDUCESUM_HPP__
diff --git a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceBitAnd.hpp b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceBitAnd.hpp
index 0a91d5f9b8..2adca71343 100644
--- a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceBitAnd.hpp
+++ b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceBitAnd.hpp
@@ -13,54 +13,51 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
           typename LAUNCH_POLICY,
           typename GLOBAL_THREAD_POLICY>
 
-void LaunchParamExptReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
-                                     const std::vector<IDX_TYPE>& seg_idx,
-                                     camp::resources::Resource working_res)
+void LaunchParamExptReduceBitAndBasicTestImpl(
+    const SEG_TYPE& seg,
+    const std::vector<IDX_TYPE>& seg_idx,
+    camp::resources::Resource working_res)
 {
   using REF_BITAND = RAJA::expt::ValOp<DATA_TYPE, RAJA::operators::bit_and>;
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
   constexpr int threads = 256;
-  int blocks = (seg.size() - 1)/threads + 1;
+  int blocks            = (seg.size() - 1) / threads + 1;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   //
   // First a simple non-trivial test that is mildly interesting
   //
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
     test_array[i] = 13;
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE simpand(21);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     RAJA::expt::Reduce<RAJA::operators::bit_and>(&simpand),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_BITAND &_simpand) {
-
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-
-          _simpand &= working_array[idx];
-     });
-
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      RAJA::expt::Reduce<RAJA::operators::bit_and>(&simpand),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_BITAND & _simpand)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(
+            ctx, seg, [&](IDX_TYPE idx) { _simpand &= working_array[idx]; });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(simpand), 5);
 
@@ -71,29 +68,35 @@ void LaunchParamExptReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE ref_and = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_and &= test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_and &= test_array[seg_idx[i]];
   }
 
   DATA_TYPE redand(0);
   DATA_TYPE redand2(2);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand),
-     RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand2),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_BITAND &_redand, REF_BITAND &_redand2) {
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-        _redand  &= working_array[idx];
-        _redand2 &= working_array[idx];
-    });
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand),
+      RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand2),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_BITAND & _redand,
+                           REF_BITAND & _redand2)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg,
+                                         [&](IDX_TYPE idx)
+                                         {
+                                           _redand &= working_array[idx];
+                                           _redand2 &= working_array[idx];
+                                         });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redand), ref_and);
   ASSERT_EQ(static_cast<DATA_TYPE>(redand2), ref_and);
@@ -101,23 +104,22 @@ void LaunchParamExptReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
   redand = 0;
 
   const int nloops = 3;
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-       RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand),
-       [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_BITAND _redand) {
-        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-          _redand &= working_array[idx];
-      });
-    });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        RAJA::expt::Reduce<RAJA::operators::bit_and>(&redand),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_BITAND _redand)
+        {
+          RAJA::loop<GLOBAL_THREAD_POLICY>(
+              ctx, seg, [&](IDX_TYPE idx) { _redand &= working_array[idx]; });
+        });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(redand), ref_and);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -125,78 +127,80 @@ void LaunchParamExptReduceBitAndBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(LaunchParamExptReduceBitAndBasicTest);
 template <typename T>
 class LaunchParamExptReduceBitAndBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchParamExptReduceBitAndBasicTest, ReduceBitAndBasicForall)
 {
-  using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<1>>::type;
-
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<1>>::type;
+
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   LaunchParamExptReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                    r1, seg_idx, working_res);
+                                           RAJA::TypedRangeSegment<IDX_TYPE>,
+                                           LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   LaunchParamExptReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                    r2, seg_idx, working_res);
+                                           RAJA::TypedRangeSegment<IDX_TYPE>,
+                                           LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   LaunchParamExptReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                    r3, seg_idx, working_res);
+                                           RAJA::TypedRangeSegment<IDX_TYPE>,
+                                           LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
-  LaunchParamExptReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                    r4, seg_idx, working_res);
+  LaunchParamExptReduceBitAndBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+      LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(r4, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
-  LaunchParamExptReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                    r5, seg_idx, working_res);
+  LaunchParamExptReduceBitAndBasicTestImpl<
+      IDX_TYPE, DATA_TYPE, RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+      LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(r5, seg_idx, working_res);
 
   // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   LaunchParamExptReduceBitAndBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                  RAJA::TypedListSegment<IDX_TYPE>,
-                                  LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                    l1, seg_idx, working_res);
+                                           RAJA::TypedListSegment<IDX_TYPE>,
+                                           LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(LaunchParamExptReduceBitAndBasicTest,
diff --git a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp
index 91ab75dbab..59840690a1 100644
--- a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp
+++ b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp
@@ -13,65 +13,66 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
           typename LAUNCH_POLICY,
           typename GLOBAL_THREAD_POLICY>
-void LaunchParamExptReduceMinBasicTestImpl(const SEG_TYPE& seg,
-                                           const std::vector<IDX_TYPE>& seg_idx,
-                                           camp::resources::Resource working_res)
+void LaunchParamExptReduceMinBasicTestImpl(
+    const SEG_TYPE& seg,
+    const std::vector<IDX_TYPE>& seg_idx,
+    camp::resources::Resource working_res)
 {
   using REF_MIN = RAJA::expt::ValOp<DATA_TYPE, RAJA::operators::minimum>;
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
   constexpr int threads = 256;
-  int blocks = (seg.size() - 1)/threads + 1;
+  int blocks            = (seg.size() - 1) / threads + 1;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
-  const int modval = 100;
-  const DATA_TYPE min_init = modval+1;
+  const int modval          = 100;
+  const DATA_TYPE min_init  = modval + 1;
   const DATA_TYPE small_min = -modval;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_min = min_init;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_min = RAJA_MIN(test_array[ seg_idx[i] ], ref_min);
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_min = RAJA_MIN(test_array[seg_idx[i]], ref_min);
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
 
   DATA_TYPE mininit(small_min);
   DATA_TYPE min(min_init);
-  
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     "LaunchMinBasicTest",
-     RAJA::expt::Reduce<RAJA::operators::minimum>(&mininit),
-     RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_MIN &_mininit, REF_MIN &_min) {
 
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-
-          _mininit.min(working_array[idx]);
-          _min.min(working_array[idx]);
-
-    });
-
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      "LaunchMinBasicTest",
+      RAJA::expt::Reduce<RAJA::operators::minimum>(&mininit),
+      RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_MIN & _mininit,
+                           REF_MIN & _min)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg,
+                                         [&](IDX_TYPE idx)
+                                         {
+                                           _mininit.min(working_array[idx]);
+                                           _min.min(working_array[idx]);
+                                         });
+      });
 
 
   ASSERT_EQ(static_cast<DATA_TYPE>(mininit), small_min);
@@ -81,38 +82,34 @@ void LaunchParamExptReduceMinBasicTestImpl(const SEG_TYPE& seg,
   ASSERT_EQ(static_cast<DATA_TYPE>(min), min_init);
 
   DATA_TYPE factor = 3;
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_MIN &_min) {
-
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-
-          _min.min(working_array[idx] * factor);
-    });
-
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_MIN & _min)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(
+            ctx, seg,
+            [&](IDX_TYPE idx) { _min.min(working_array[idx] * factor); });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(min), ref_min * factor);
 
 
   factor = 2;
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_MIN &_min) {
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-
-          _min.min(working_array[idx] * factor);
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      RAJA::expt::Reduce<RAJA::operators::minimum>(&min),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_MIN & _min)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(
+            ctx, seg,
+            [&](IDX_TYPE idx) { _min.min(working_array[idx] * factor); });
       });
-  });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(min), ref_min * factor);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -120,78 +117,82 @@ void LaunchParamExptReduceMinBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(LaunchParamExptReduceMinBasicTest);
 template <typename T>
 class LaunchParamExptReduceMinBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchParamExptReduceMinBasicTest, ReduceMinBasicForall)
 {
-  using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<1>>::type;
-
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<1>>::type;
+
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   LaunchParamExptReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                 r1, seg_idx, working_res);
+                                        RAJA::TypedRangeSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r1, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   LaunchParamExptReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                 r2, seg_idx, working_res);
+                                        RAJA::TypedRangeSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   LaunchParamExptReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                 r3, seg_idx, working_res);
+                                        RAJA::TypedRangeSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   LaunchParamExptReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                 r4, seg_idx, working_res);
+                                        RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r4, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   LaunchParamExptReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                 r5, seg_idx, working_res);
+                                        RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r5, seg_idx, working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   LaunchParamExptReduceMinBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                               RAJA::TypedListSegment<IDX_TYPE>,
-                               LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                 l1, seg_idx, working_res);
+                                        RAJA::TypedListSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(LaunchParamExptReduceMinBasicTest,
diff --git a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp
index f6200628cf..d040dda264 100644
--- a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp
+++ b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp
@@ -13,41 +13,42 @@
 #include <numeric>
 #include <vector>
 
-template <typename IDX_TYPE, typename DATA_TYPE,
+template <typename IDX_TYPE,
+          typename DATA_TYPE,
           typename SEG_TYPE,
           typename LAUNCH_POLICY,
           typename GLOBAL_THREAD_POLICY>
-void LaunchParamExptReduceSumBasicTestImpl(const SEG_TYPE& seg,
-                                           const std::vector<IDX_TYPE>& seg_idx,
-                                           camp::resources::Resource working_res)
+void LaunchParamExptReduceSumBasicTestImpl(
+    const SEG_TYPE& seg,
+    const std::vector<IDX_TYPE>& seg_idx,
+    camp::resources::Resource working_res)
 {
   using REF_SUM = RAJA::expt::ValOp<DATA_TYPE, RAJA::operators::plus>;
 
   IDX_TYPE data_len = seg_idx[seg_idx.size() - 1] + 1;
-  IDX_TYPE idx_len = static_cast<IDX_TYPE>( seg_idx.size() );
+  IDX_TYPE idx_len  = static_cast<IDX_TYPE>(seg_idx.size());
 
   DATA_TYPE* working_array;
   DATA_TYPE* check_array;
   DATA_TYPE* test_array;
 
   constexpr int threads = 256;
-  int blocks = (seg.size() - 1)/threads + 1;
+  int blocks            = (seg.size() - 1) / threads + 1;
 
-  allocateForallTestData<DATA_TYPE>(data_len,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+  allocateForallTestData<DATA_TYPE>(data_len, working_res, &working_array,
+                                    &check_array, &test_array);
 
   const int modval = 100;
 
-  for (IDX_TYPE i = 0; i < data_len; ++i) {
-    test_array[i] = static_cast<DATA_TYPE>( rand() % modval );
+  for (IDX_TYPE i = 0; i < data_len; ++i)
+  {
+    test_array[i] = static_cast<DATA_TYPE>(rand() % modval);
   }
 
   DATA_TYPE ref_sum = 0;
-  for (IDX_TYPE i = 0; i < idx_len; ++i) {
-    ref_sum += test_array[ seg_idx[i] ];
+  for (IDX_TYPE i = 0; i < idx_len; ++i)
+  {
+    ref_sum += test_array[seg_idx[i]];
   }
 
   working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len);
@@ -55,19 +56,20 @@ void LaunchParamExptReduceSumBasicTestImpl(const SEG_TYPE& seg,
 
   DATA_TYPE sum(0), sum2(2);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-     "LaunchSumBasicTest",
-     RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
-     RAJA::expt::Reduce<RAJA::operators::plus>(&sum2),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_SUM &_sum, REF_SUM &_sum2) {
-
-      RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-          _sum  += working_array[idx];
-          _sum2 += working_array[idx];
-     });
-
-  });
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+      "LaunchSumBasicTest", RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
+      RAJA::expt::Reduce<RAJA::operators::plus>(&sum2),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_SUM & _sum,
+                           REF_SUM & _sum2)
+      {
+        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg,
+                                         [&](IDX_TYPE idx)
+                                         {
+                                           _sum += working_array[idx];
+                                           _sum2 += working_array[idx];
+                                         });
+      });
 
   ASSERT_EQ(static_cast<DATA_TYPE>(sum), ref_sum);
   ASSERT_EQ(static_cast<DATA_TYPE>(sum2), ref_sum + 2);
@@ -76,24 +78,22 @@ void LaunchParamExptReduceSumBasicTestImpl(const SEG_TYPE& seg,
 
   const int nloops = 2;
 
-  for (int j = 0; j < nloops; ++j) {
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-       RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
-       [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_SUM &_sum) {
-
-        RAJA::loop<GLOBAL_THREAD_POLICY>(ctx, seg, [&](IDX_TYPE idx) {
-            _sum += working_array[idx];
-          });
-      });
+  for (int j = 0; j < nloops; ++j)
+  {
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_SUM & _sum)
+        {
+          RAJA::loop<GLOBAL_THREAD_POLICY>(
+              ctx, seg, [&](IDX_TYPE idx) { _sum += working_array[idx]; });
+        });
   }
 
   ASSERT_EQ(static_cast<DATA_TYPE>(sum), nloops * ref_sum);
 
 
-  deallocateForallTestData<DATA_TYPE>(working_res,
-                                      working_array,
-                                      check_array,
+  deallocateForallTestData<DATA_TYPE>(working_res, working_array, check_array,
                                       test_array);
 }
 
@@ -101,78 +101,82 @@ void LaunchParamExptReduceSumBasicTestImpl(const SEG_TYPE& seg,
 TYPED_TEST_SUITE_P(LaunchParamExptReduceSumBasicTest);
 template <typename T>
 class LaunchParamExptReduceSumBasicTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchParamExptReduceSumBasicTest, ReduceSumBasicForall)
 {
-  using IDX_TYPE      = typename camp::at<TypeParam, camp::num<0>>::type;
-  using DATA_TYPE     = typename camp::at<TypeParam, camp::num<1>>::type;
-  using WORKING_RES   = typename camp::at<TypeParam, camp::num<2>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<3>>::type, camp::num<1>>::type;
-
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  using IDX_TYPE    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using DATA_TYPE   = typename camp::at<TypeParam, camp::num<1>>::type;
+  using WORKING_RES = typename camp::at<TypeParam, camp::num<2>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<3>>::type,
+                        camp::num<1>>::type;
+
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   std::vector<IDX_TYPE> seg_idx;
 
-// Range segment tests
-  RAJA::TypedRangeSegment<IDX_TYPE> r1( 0, 28 );
+  // Range segment tests
+  RAJA::TypedRangeSegment<IDX_TYPE> r1(0, 28);
   RAJA::getIndices(seg_idx, r1);
   LaunchParamExptReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
                                         RAJA::TypedRangeSegment<IDX_TYPE>,
                                         LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                        r1, seg_idx, working_res);
-     
+      r1, seg_idx, working_res);
+
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r2( 3, 642 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r2(3, 642);
   RAJA::getIndices(seg_idx, r2);
   LaunchParamExptReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                       RAJA::TypedRangeSegment<IDX_TYPE>,
-                                       LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                       r2, seg_idx, working_res);
+                                        RAJA::TypedRangeSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r2, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeSegment<IDX_TYPE> r3( 0, 2057 );
+  RAJA::TypedRangeSegment<IDX_TYPE> r3(0, 2057);
   RAJA::getIndices(seg_idx, r3);
   LaunchParamExptReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                       RAJA::TypedRangeSegment<IDX_TYPE>,
-                                       LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                       r3, seg_idx, working_res);
+                                        RAJA::TypedRangeSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r3, seg_idx, working_res);
 
-// Range-stride segment tests
+  // Range-stride segment tests
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4( 0, 188, 2 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r4(0, 188, 2);
   RAJA::getIndices(seg_idx, r4);
   LaunchParamExptReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                       RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                       LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                       r4, seg_idx, working_res);
+                                        RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r4, seg_idx, working_res);
 
   seg_idx.clear();
-  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5( 3, 1029, 3 );
+  RAJA::TypedRangeStrideSegment<IDX_TYPE> r5(3, 1029, 3);
   RAJA::getIndices(seg_idx, r5);
   LaunchParamExptReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                       RAJA::TypedRangeStrideSegment<IDX_TYPE>,
-                                       LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                       r5, seg_idx, working_res);
+                                        RAJA::TypedRangeStrideSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      r5, seg_idx, working_res);
 
-// List segment tests
+  // List segment tests
   seg_idx.clear();
   IDX_TYPE last = 10567;
-  srand( time(NULL) );
-  for (IDX_TYPE i = 0; i < last; ++i) {
-    IDX_TYPE randval = IDX_TYPE( rand() % RAJA::stripIndexType(last) );
-    if ( i < randval ) {
+  srand(time(NULL));
+  for (IDX_TYPE i = 0; i < last; ++i)
+  {
+    IDX_TYPE randval = IDX_TYPE(rand() % RAJA::stripIndexType(last));
+    if (i < randval)
+    {
       seg_idx.push_back(i);
     }
   }
-  RAJA::TypedListSegment<IDX_TYPE> l1( &seg_idx[0], seg_idx.size(),
-                                       working_res );
+  RAJA::TypedListSegment<IDX_TYPE> l1(&seg_idx[0], seg_idx.size(), working_res);
   LaunchParamExptReduceSumBasicTestImpl<IDX_TYPE, DATA_TYPE,
-                                       RAJA::TypedListSegment<IDX_TYPE>,
-                                      LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
-                                      l1, seg_idx, working_res);
+                                        RAJA::TypedListSegment<IDX_TYPE>,
+                                        LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      l1, seg_idx, working_res);
 }
 
 REGISTER_TYPED_TEST_SUITE_P(LaunchParamExptReduceSumBasicTest,
diff --git a/test/functional/launch/run-time-switch/tests/test-launch-BasicShared.hpp b/test/functional/launch/run-time-switch/tests/test-launch-BasicShared.hpp
index 702d5c6cd3..094aeb131d 100644
--- a/test/functional/launch/run-time-switch/tests/test-launch-BasicShared.hpp
+++ b/test/functional/launch/run-time-switch/tests/test-launch-BasicShared.hpp
@@ -10,98 +10,105 @@
 
 #include <numeric>
 
-template <typename WORKING_RES, typename LAUNCH_POLICY, typename TEAM_POLICY, typename THREAD_POLICY>
+template <typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename TEAM_POLICY,
+          typename THREAD_POLICY>
 void LaunchBasicSharedTestImpl()
 {
 
   int N = 1000;
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   int* working_array;
   int* check_array;
   int* test_array;
 
-  allocateForallTestData<int>(N*N,
-                             working_res,
-                             &working_array,
-                             &check_array,
-                             &test_array);
+  allocateForallTestData<int>(N * N, working_res, &working_array, &check_array,
+                              &test_array);
 
 
-
-  //Select platform
+  // Select platform
   RAJA::ExecPlace select_cpu_or_gpu;
-  if (working_res.get_platform()  == camp::resources::Platform::host){
+  if (working_res.get_platform() == camp::resources::Platform::host)
+  {
     select_cpu_or_gpu = RAJA::ExecPlace::HOST;
-  }else{
+  }
+  else
+  {
     select_cpu_or_gpu = RAJA::ExecPlace::DEVICE;
   }
 
   size_t shared_mem_size = 1 * sizeof(int);
 
-  RAJA::launch<LAUNCH_POLICY>
-    (select_cpu_or_gpu,
-     RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N), shared_mem_size),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-          RAJA::loop<TEAM_POLICY>(ctx, RAJA::RangeSegment(0, N), [&](int r) {
-
-                // Array shared within threads of the same team
-              int * s_A = ctx.getSharedMemory<int>(1);
-
-                RAJA::loop<THREAD_POLICY>(ctx, RAJA::RangeSegment(0, 1), [&](int c) {
-                    s_A[c] = r;
-                });
-
-                ctx.teamSync();
-
-                //broadcast shared value to all threads and write to array
-                RAJA::loop<THREAD_POLICY>(ctx, RAJA::RangeSegment(0, N), [&](int c) {
-                    const int idx = c + N*r;
-                    working_array[idx] = s_A[0];
-                });  // loop j
-
-                ctx.releaseSharedMemory();
-              });  // loop r
-        });  // outer lambda
-
-
-
-  working_res.memcpy(check_array, working_array, sizeof(int) * N*N);
-
-  for(int r = 0; r < N; ++r) {
-    for (int c = 0; c < N; c++) {
-      ASSERT_EQ(r, check_array[c + r*N]);
+  RAJA::launch<LAUNCH_POLICY>(
+      select_cpu_or_gpu,
+      RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N), shared_mem_size),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<TEAM_POLICY>(
+            ctx, RAJA::RangeSegment(0, N),
+            [&](int r)
+            {
+              // Array shared within threads of the same team
+              int* s_A = ctx.getSharedMemory<int>(1);
+
+              RAJA::loop<THREAD_POLICY>(ctx, RAJA::RangeSegment(0, 1),
+                                        [&](int c) { s_A[c] = r; });
+
+              ctx.teamSync();
+
+              // broadcast shared value to all threads and write to array
+              RAJA::loop<THREAD_POLICY>(ctx, RAJA::RangeSegment(0, N),
+                                        [&](int c)
+                                        {
+                                          const int idx      = c + N * r;
+                                          working_array[idx] = s_A[0];
+                                        });  // loop j
+
+              ctx.releaseSharedMemory();
+            });  // loop r
+      });        // outer lambda
+
+
+  working_res.memcpy(check_array, working_array, sizeof(int) * N * N);
+
+  for (int r = 0; r < N; ++r)
+  {
+    for (int c = 0; c < N; c++)
+    {
+      ASSERT_EQ(r, check_array[c + r * N]);
     }
   }
 
-  deallocateForallTestData<int>(working_res,
-                               working_array,
-                               check_array,
-                               test_array);
+  deallocateForallTestData<int>(working_res, working_array, check_array,
+                                test_array);
 }
 
 
 TYPED_TEST_SUITE_P(LaunchBasicSharedTest);
 template <typename T>
 class LaunchBasicSharedTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchBasicSharedTest, BasicSharedTeams)
 {
 
   using WORKING_RES = typename camp::at<TypeParam, camp::num<0>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<1>>::type, camp::num<0>>::type;
-  using TEAM_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<1>>::type, camp::num<1>>::type;
-  using THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<1>>::type, camp::num<2>>::type;
-
-  LaunchBasicSharedTestImpl<WORKING_RES, LAUNCH_POLICY, TEAM_POLICY, THREAD_POLICY>();
-
-
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<1>>::type,
+                        camp::num<0>>::type;
+  using TEAM_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<1>>::type,
+                        camp::num<1>>::type;
+  using THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<1>>::type,
+                        camp::num<2>>::type;
+
+  LaunchBasicSharedTestImpl<WORKING_RES, LAUNCH_POLICY, TEAM_POLICY,
+                            THREAD_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchBasicSharedTest,
-                            BasicSharedTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchBasicSharedTest, BasicSharedTeams);
 
 #endif  // __TEST_BASIC_SHARED_HPP__
diff --git a/test/functional/launch/segment/tests/test-launch-ListSegment.hpp b/test/functional/launch/segment/tests/test-launch-ListSegment.hpp
index 9ed358208f..8faa4111ad 100644
--- a/test/functional/launch/segment/tests/test-launch-ListSegment.hpp
+++ b/test/functional/launch/segment/tests/test-launch-ListSegment.hpp
@@ -15,101 +15,114 @@
 #include <algorithm>
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY, typename GLOBAL_THREAD_POICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename GLOBAL_THREAD_POICY>
 void LaunchListSegmentTestImpl(INDEX_TYPE N)
 {
 
   // Create and initialize indices in idx_array used to create list segment
   std::vector<INDEX_TYPE> idx_array;
 
-  srand ( time(NULL) );
+  srand(time(NULL));
 
-  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i) {
+  for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i)
+  {
     INDEX_TYPE randval = INDEX_TYPE(rand() % RAJA::stripIndexType(N));
-    if ( i < randval ) {
+    if (i < randval)
+    {
       idx_array.push_back(i);
     }
   }
 
   size_t idxlen = idx_array.size();
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
 
   // Create list segment for tests
   INDEX_TYPE* idx_vals = nullptr;
-  if (N > 0) {
+  if (N > 0)
+  {
     idx_vals = &idx_array[0];
   }
-  RAJA::TypedListSegment<INDEX_TYPE> lseg(idx_vals, idxlen,
-                                          working_res);
+  RAJA::TypedListSegment<INDEX_TYPE> lseg(idx_vals, idxlen, working_res);
 
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
   constexpr int threads = 256;
-  int blocks = (data_len - 1)/threads + 1;
+  int blocks            = (data_len - 1) / threads + 1;
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
-    for (size_t i = 0; i < idxlen; ++i) {
-      test_array[ RAJA::stripIndexType(idx_vals[i]) ] = idx_vals[i];
+    for (size_t i = 0; i < idxlen; ++i)
+    {
+      test_array[RAJA::stripIndexType(idx_vals[i])] = idx_vals[i];
     }
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<GLOBAL_THREAD_POICY>(ctx, lseg, [&](INDEX_TYPE idx) {
-            working_array[RAJA::stripIndexType(idx)] = idx;
-          });
-      });
-
-  } else { // zero-length segment
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
+
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<GLOBAL_THREAD_POICY>(
+              ctx, lseg,
+              [&](INDEX_TYPE idx)
+              { working_array[RAJA::stripIndexType(idx)] = idx; });
+        });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
-
-    RAJA::launch<LAUNCH_POLICY>
-      (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<GLOBAL_THREAD_POICY>(ctx, lseg, [&](INDEX_TYPE idx) {
-            (void) idx;
-            working_array[0]++;
-          });
-      });
-
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
+
+    RAJA::launch<LAUNCH_POLICY>(
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<GLOBAL_THREAD_POICY>(ctx, lseg,
+                                          [&](INDEX_TYPE idx)
+                                          {
+                                            (void)idx;
+                                            working_array[0]++;
+                                          });
+        });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-      ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  if (RAJA::stripIndexType(N) > 0)
+  {
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+    {
+      ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+                check_array[RAJA::stripIndexType(i)]);
     }
-  } else {
+  }
+  else
+  {
     ASSERT_EQ(test_array[0], check_array[0]);
   }
 
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -117,27 +130,33 @@ void LaunchListSegmentTestImpl(INDEX_TYPE N)
 TYPED_TEST_SUITE_P(LaunchListSegmentTest);
 template <typename T>
 class LaunchListSegmentTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchListSegmentTest, ListSegmentTeams)
 {
   using INDEX_TYPE       = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
 
   // test zero-length list segment
-  LaunchListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(0));
+  LaunchListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, LAUNCH_POLICY,
+                            GLOBAL_THREAD_POLICY>(INDEX_TYPE(0));
 
-  LaunchListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(13));
+  LaunchListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, LAUNCH_POLICY,
+                            GLOBAL_THREAD_POLICY>(INDEX_TYPE(13));
 
-  LaunchListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(2047));
+  LaunchListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, LAUNCH_POLICY,
+                            GLOBAL_THREAD_POLICY>(INDEX_TYPE(2047));
 
-  LaunchListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(32000));
+  LaunchListSegmentTestImpl<INDEX_TYPE, WORKING_RESOURCE, LAUNCH_POLICY,
+                            GLOBAL_THREAD_POLICY>(INDEX_TYPE(32000));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchListSegmentTest,
-                            ListSegmentTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchListSegmentTest, ListSegmentTeams);
 
 #endif  // __TEST_TEAMS_LISTSEGMENT_HPP__
diff --git a/test/functional/launch/segment/tests/test-launch-RangeSegment.hpp b/test/functional/launch/segment/tests/test-launch-RangeSegment.hpp
index aa2cb2c4b5..2d36a6316b 100644
--- a/test/functional/launch/segment/tests/test-launch-RangeSegment.hpp
+++ b/test/functional/launch/segment/tests/test-launch-RangeSegment.hpp
@@ -10,86 +10,87 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY, typename GLOBAL_THREAD_POICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename GLOBAL_THREAD_POICY>
 void LaunchRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 {
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first), RAJA::stripIndexType(last));
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first),
+                                         RAJA::stripIndexType(last));
   INDEX_TYPE N = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
   constexpr int threads = 256;
-  int blocks = (data_len - 1)/threads + 1;
+  int blocks            = (data_len - 1) / threads + 1;
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     const INDEX_TYPE rbegin = *r1.begin();
 
     std::iota(test_array, test_array + RAJA::stripIndexType(N), rbegin);
 
     RAJA::launch<LAUNCH_POLICY>(
-      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
-        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<GLOBAL_THREAD_POICY>(
-          ctx, r1, [&](INDEX_TYPE idx) {
-            working_array[RAJA::stripIndexType(idx - rbegin)] = idx;
-          }
-        );
-      }
-    );
-
-  } else { // zero-length segment
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<GLOBAL_THREAD_POICY>(
+              ctx, r1,
+              [&](INDEX_TYPE idx)
+              { working_array[RAJA::stripIndexType(idx - rbegin)] = idx; });
+        });
+  }
+  else
+  {  // zero-length segment
 
     memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
+    working_res.memcpy(working_array, test_array,
+                       sizeof(INDEX_TYPE) * data_len);
 
     RAJA::launch<LAUNCH_POLICY>(
-      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),  [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<GLOBAL_THREAD_POICY>(
-          ctx, r1, [&](INDEX_TYPE RAJA_UNUSED_ARG(idx)) {
-            working_array[0]++;
-          }
-        );
-      }
-    );
-
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<GLOBAL_THREAD_POICY>(ctx, r1,
+                                          [&](INDEX_TYPE RAJA_UNUSED_ARG(idx))
+                                          { working_array[0]++; });
+        });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
-    
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-      ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  if (RAJA::stripIndexType(N) > 0)
+  {
+
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+    {
+      ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+                check_array[RAJA::stripIndexType(i)]);
     }
-    
-  } else {
-    
+  }
+  else
+  {
+
     ASSERT_EQ(test_array[0], check_array[0]);
-    
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -97,24 +98,36 @@ void LaunchRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last)
 TYPED_TEST_SUITE_P(LaunchRangeSegmentTest);
 template <typename T>
 class LaunchRangeSegmentTest : public ::testing::Test
-{
-};
-
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY, typename GLOBAL_THREAD_POLICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+{};
+
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename GLOBAL_THREAD_POLICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
-{
-}
-
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY, typename GLOBAL_THREAD_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+{}
+
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename GLOBAL_THREAD_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeTests()
 {
   // test zero-length range segment
-  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(-5));
-
-  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0));
-  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5));
+  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                             GLOBAL_THREAD_POLICY>(INDEX_TYPE(-5),
+                                                   INDEX_TYPE(-5));
+
+  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                             GLOBAL_THREAD_POLICY>(INDEX_TYPE(-5),
+                                                   INDEX_TYPE(0));
+  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                             GLOBAL_THREAD_POLICY>(INDEX_TYPE(-5),
+                                                   INDEX_TYPE(5));
 }
 
 TYPED_TEST_P(LaunchRangeSegmentTest, RangeSegmentTeams)
@@ -122,20 +135,32 @@ TYPED_TEST_P(LaunchRangeSegmentTest, RangeSegmentTeams)
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
 
   // test zero-length range segment
-  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(3), INDEX_TYPE(3));
-
-  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(0), INDEX_TYPE(27));
-  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(1), INDEX_TYPE(2047));
-  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(1), INDEX_TYPE(32000));
-
-  runNegativeTests<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>();
+  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                             GLOBAL_THREAD_POLICY>(INDEX_TYPE(3),
+                                                   INDEX_TYPE(3));
+
+  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                             GLOBAL_THREAD_POLICY>(INDEX_TYPE(0),
+                                                   INDEX_TYPE(27));
+  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                             GLOBAL_THREAD_POLICY>(INDEX_TYPE(1),
+                                                   INDEX_TYPE(2047));
+  LaunchRangeSegmentTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                             GLOBAL_THREAD_POLICY>(INDEX_TYPE(1),
+                                                   INDEX_TYPE(32000));
+
+  runNegativeTests<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                   GLOBAL_THREAD_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchRangeSegmentTest,
-                            RangeSegmentTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchRangeSegmentTest, RangeSegmentTeams);
 
 #endif  // __TEST_RANGE_SEGMENT_HPP__
diff --git a/test/functional/launch/segment/tests/test-launch-RangeStrideSegment.hpp b/test/functional/launch/segment/tests/test-launch-RangeStrideSegment.hpp
index 94a1a77bcf..d25d46ce8f 100644
--- a/test/functional/launch/segment/tests/test-launch-RangeStrideSegment.hpp
+++ b/test/functional/launch/segment/tests/test-launch-RangeStrideSegment.hpp
@@ -10,90 +10,94 @@
 
 #include <cstring>
 
-template <typename INDEX_TYPE, typename DIFF_TYPE,
-          typename WORKING_RES, typename LAUNCH_POLICY, typename GLOBAL_THREAD_POICY>
-void LaunchRangeStrideSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last,
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename GLOBAL_THREAD_POICY>
+void LaunchRangeStrideSegmentTestImpl(INDEX_TYPE first,
+                                      INDEX_TYPE last,
                                       DIFF_TYPE stride)
 {
-  RAJA::TypedRangeStrideSegment<INDEX_TYPE> r1(RAJA::stripIndexType(first), RAJA::stripIndexType(last), stride);
+  RAJA::TypedRangeStrideSegment<INDEX_TYPE> r1(
+      RAJA::stripIndexType(first), RAJA::stripIndexType(last), stride);
   INDEX_TYPE N = INDEX_TYPE(r1.size());
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
+  camp::resources::Resource host_res {camp::resources::Host()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
   memset(static_cast<void*>(test_array), 0, sizeof(INDEX_TYPE) * data_len);
 
   working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len);
 
   constexpr int threads = 256;
-  int blocks = (data_len - 1)/threads + 1;
+  int blocks            = (data_len - 1) / threads + 1;
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     INDEX_TYPE idx = first;
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i) {
-      test_array[ RAJA::stripIndexType((idx-first)/stride) ] = idx;
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; ++i)
+    {
+      test_array[RAJA::stripIndexType((idx - first) / stride)] = idx;
       idx += stride;
     }
 
     RAJA::launch<LAUNCH_POLICY>(
-      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<GLOBAL_THREAD_POICY>(
-          ctx, r1, [&](INDEX_TYPE idx) {
-            working_array[ RAJA::stripIndexType((idx-first)/stride) ] = idx;
-          }
-        );
-
-      }
-    );
-
-  } else { // zero-length segment
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<GLOBAL_THREAD_POICY>(
+              ctx, r1,
+              [&](INDEX_TYPE idx) {
+                working_array[RAJA::stripIndexType((idx - first) / stride)] =
+                    idx;
+              });
+        });
+  }
+  else
+  {  // zero-length segment
 
     RAJA::launch<LAUNCH_POLICY>(
-      RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::loop<GLOBAL_THREAD_POICY>(
-          ctx, r1, [&](INDEX_TYPE RAJA_UNUSED_ARG(idx)) {
-            working_array[0]++;
-          }
-        );
-
-      }
-    );
+        RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::loop<GLOBAL_THREAD_POICY>(ctx, r1,
+                                          [&](INDEX_TYPE RAJA_UNUSED_ARG(idx))
+                                          { working_array[0]++; });
+        });
   }
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
-    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++) {
-      ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+    for (INDEX_TYPE i = INDEX_TYPE(0); i < N; i++)
+    {
+      ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+                check_array[RAJA::stripIndexType(i)]);
     }
-
-  } else {
+  }
+  else
+  {
 
     ASSERT_EQ(test_array[0], check_array[0]);
-
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -101,26 +105,44 @@ void LaunchRangeStrideSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last,
 TYPED_TEST_SUITE_P(LaunchRangeStrideSegmentTest);
 template <typename T>
 class LaunchRangeStrideSegmentTest : public ::testing::Test
-{
-};
-
-template <typename INDEX_TYPE, typename DIFF_TYPE, typename WORKING_RES, typename LAUNCH_POLICY, typename GLOBAL_THREAD_POICY,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+{};
+
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename GLOBAL_THREAD_POICY,
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeStrideTests()
-{
-}
-
-template <typename INDEX_TYPE, typename DIFF_TYPE, typename WORKING_RES, typename LAUNCH_POLICY, typename GLOBAL_THREAD_POLICY,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
+{}
+
+template <typename INDEX_TYPE,
+          typename DIFF_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename GLOBAL_THREAD_POLICY,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<INDEX_TYPE>>::value>::type* = nullptr>
 void runNegativeStrideTests()
 {
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(-10), INDEX_TYPE(-1), DIFF_TYPE(2));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(0), DIFF_TYPE(2));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(-5), INDEX_TYPE(5), DIFF_TYPE(3));
-
-// Test negative strides
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(10), INDEX_TYPE(-1), DIFF_TYPE(-1));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(10), INDEX_TYPE(0), DIFF_TYPE(-2));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(-10), INDEX_TYPE(-1), DIFF_TYPE(2));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(0), DIFF_TYPE(2));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(-5), INDEX_TYPE(5), DIFF_TYPE(3));
+
+  // Test negative strides
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(10), INDEX_TYPE(-1), DIFF_TYPE(-1));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(10), INDEX_TYPE(0), DIFF_TYPE(-2));
 }
 
 
@@ -128,23 +150,47 @@ TYPED_TEST_P(LaunchRangeStrideSegmentTest, RangeStrideSegmentTeams)
 {
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-  using GLOBAL_THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using DIFF_TYPE   = typename std::make_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::type;
-
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(1));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(1));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(2));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(2));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(0), INDEX_TYPE(21), DIFF_TYPE(2));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(1), INDEX_TYPE(21), DIFF_TYPE(2));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(1), INDEX_TYPE(255), DIFF_TYPE(2));
-
-// Test size zero segments
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(-2));
-  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(-2));
-
-  runNegativeStrideTests<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY, GLOBAL_THREAD_POLICY>();
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+  using GLOBAL_THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using DIFF_TYPE =
+      typename std::make_signed<RAJA::strip_index_type_t<INDEX_TYPE>>::type;
+
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(1));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(1));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(2));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(2));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(21), DIFF_TYPE(2));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(21), DIFF_TYPE(2));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(255), DIFF_TYPE(2));
+
+  // Test size zero segments
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(0), INDEX_TYPE(20), DIFF_TYPE(-2));
+  LaunchRangeStrideSegmentTestImpl<INDEX_TYPE, DIFF_TYPE, WORKING_RES,
+                                   LAUNCH_POLICY, GLOBAL_THREAD_POLICY>(
+      INDEX_TYPE(1), INDEX_TYPE(20), DIFF_TYPE(-2));
+
+  runNegativeStrideTests<INDEX_TYPE, DIFF_TYPE, WORKING_RES, LAUNCH_POLICY,
+                         GLOBAL_THREAD_POLICY>();
 }
 
 REGISTER_TYPED_TEST_SUITE_P(LaunchRangeStrideSegmentTest,
diff --git a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp
index 8da7b81eb7..0d2fa4d789 100644
--- a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp
+++ b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp
@@ -10,81 +10,105 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY, typename TEAM_POLICY, typename THREAD_POLICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename TEAM_POLICY,
+          typename THREAD_POLICY>
 void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range)
 {
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> outer_range(RAJA::stripIndexType(INDEX_TYPE(0)), RAJA::stripIndexType(block_range));
-  RAJA::TypedRangeSegment<INDEX_TYPE> inner_range(RAJA::stripIndexType(INDEX_TYPE(0)), RAJA::stripIndexType(thread_range));
+  RAJA::TypedRangeSegment<INDEX_TYPE> outer_range(
+      RAJA::stripIndexType(INDEX_TYPE(0)), RAJA::stripIndexType(block_range));
+  RAJA::TypedRangeSegment<INDEX_TYPE> inner_range(
+      RAJA::stripIndexType(INDEX_TYPE(0)), RAJA::stripIndexType(thread_range));
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  size_t data_len = RAJA::stripIndexType(block_range)*RAJA::stripIndexType(thread_range);
+  size_t data_len =
+      RAJA::stripIndexType(block_range) * RAJA::stripIndexType(thread_range);
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  //determine the underlying type of block_range
+  // determine the underlying type of block_range
   using s_type = decltype(RAJA::stripIndexType(block_range));
 
-  for(s_type b=0; b<RAJA::stripIndexType(block_range); ++b) {
-    for(s_type c=0; c<RAJA::stripIndexType(thread_range); ++c) {
-      s_type idx = c + RAJA::stripIndexType(thread_range)*b;
+  for (s_type b = 0; b < RAJA::stripIndexType(block_range); ++b)
+  {
+    for (s_type c = 0; c < RAJA::stripIndexType(thread_range); ++c)
+    {
+      s_type idx      = c + RAJA::stripIndexType(thread_range) * b;
       test_array[idx] = INDEX_TYPE(idx) + INDEX_TYPE(c);
     }
   }
 
-  size_t shared_mem_size = RAJA::stripIndexType(thread_range)*sizeof(INDEX_TYPE);
-
-  //Use an int type to test the bump style allocator.
-  //Key idea is that we are requesting different amounts.
-  shared_mem_size += RAJA::stripIndexType(thread_range)*sizeof(int);
-
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(RAJA::stripIndexType(block_range)),
-                        RAJA::Threads(RAJA::stripIndexType(thread_range)), shared_mem_size),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-      RAJA::loop<TEAM_POLICY>(ctx, outer_range, [&](INDEX_TYPE bid) {
-
-          INDEX_TYPE * tile_ptr = ctx.getSharedMemory<INDEX_TYPE>(RAJA::stripIndexType(thread_range));
-          RAJA::View<INDEX_TYPE, RAJA::Layout<1>> Tile(tile_ptr, RAJA::stripIndexType(thread_range));
-
-          int * int_tile_ptr = ctx.getSharedMemory<int>(RAJA::stripIndexType(thread_range));
-          RAJA::View<int, RAJA::Layout<1>> Int_Tile(int_tile_ptr, RAJA::stripIndexType(thread_range));
-
-          RAJA::loop<THREAD_POLICY>(ctx, inner_range, [&](INDEX_TYPE tid) {
-              Int_Tile(RAJA::stripIndexType(tid)) = RAJA::stripIndexType(tid);
-              Tile(RAJA::stripIndexType(thread_range)-RAJA::stripIndexType(tid)-1) = thread_range-tid-1 + thread_range*bid;
+  size_t shared_mem_size =
+      RAJA::stripIndexType(thread_range) * sizeof(INDEX_TYPE);
+
+  // Use an int type to test the bump style allocator.
+  // Key idea is that we are requesting different amounts.
+  shared_mem_size += RAJA::stripIndexType(thread_range) * sizeof(int);
+
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(RAJA::stripIndexType(block_range)),
+                         RAJA::Threads(RAJA::stripIndexType(thread_range)),
+                         shared_mem_size),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<TEAM_POLICY>(
+            ctx, outer_range,
+            [&](INDEX_TYPE bid)
+            {
+              INDEX_TYPE* tile_ptr = ctx.getSharedMemory<INDEX_TYPE>(
+                  RAJA::stripIndexType(thread_range));
+              RAJA::View<INDEX_TYPE, RAJA::Layout<1>> Tile(
+                  tile_ptr, RAJA::stripIndexType(thread_range));
+
+              int* int_tile_ptr =
+                  ctx.getSharedMemory<int>(RAJA::stripIndexType(thread_range));
+              RAJA::View<int, RAJA::Layout<1>> Int_Tile(
+                  int_tile_ptr, RAJA::stripIndexType(thread_range));
+
+              RAJA::loop<THREAD_POLICY>(
+                  ctx, inner_range,
+                  [&](INDEX_TYPE tid)
+                  {
+                    Int_Tile(RAJA::stripIndexType(tid)) =
+                        RAJA::stripIndexType(tid);
+                    Tile(RAJA::stripIndexType(thread_range) -
+                         RAJA::stripIndexType(tid) - 1) =
+                        thread_range - tid - 1 + thread_range * bid;
+                  });
+
+              ctx.teamSync();
+
+              RAJA::loop<THREAD_POLICY>(
+                  ctx, inner_range,
+                  [&](INDEX_TYPE tid)
+                  {
+                    INDEX_TYPE idx = tid + thread_range * bid;
+                    working_array[RAJA::stripIndexType(idx)] =
+                        Tile(RAJA::stripIndexType(tid)) +
+                        Int_Tile(RAJA::stripIndexType(tid));
+                  });
+
+              ctx.releaseSharedMemory();
             });
-
-          ctx.teamSync();
-
-          RAJA::loop<THREAD_POLICY>(ctx, inner_range, [&](INDEX_TYPE tid) {
-              INDEX_TYPE idx = tid + thread_range * bid;
-              working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)) + Int_Tile(RAJA::stripIndexType(tid));
-          });
-
-          ctx.releaseSharedMemory();
-        });
-
-    });
+      });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (size_t i = 0; i < data_len; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (size_t i = 0; i < data_len; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -92,28 +116,31 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range)
 TYPED_TEST_SUITE_P(LaunchDynamicMemTest);
 template <typename T>
 class LaunchDynamicMemTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchDynamicMemTest, DynamicMemLaunch)
 {
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-  using TEAM_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<2>>::type;
-
-
-  LaunchDynamicMemTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, TEAM_POLICY, THREAD_POLICY>
-    (INDEX_TYPE(4), INDEX_TYPE(2));
-
-  LaunchDynamicMemTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, TEAM_POLICY, THREAD_POLICY>
-    (INDEX_TYPE(5), INDEX_TYPE(32));
-
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+  using TEAM_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<2>>::type;
+
+
+  LaunchDynamicMemTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, TEAM_POLICY,
+                           THREAD_POLICY>(INDEX_TYPE(4), INDEX_TYPE(2));
+
+  LaunchDynamicMemTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, TEAM_POLICY,
+                           THREAD_POLICY>(INDEX_TYPE(5), INDEX_TYPE(32));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchDynamicMemTest,
-                            DynamicMemLaunch);
+REGISTER_TYPED_TEST_SUITE_P(LaunchDynamicMemTest, DynamicMemLaunch);
 
 #endif  // __TEST_DYNAMIC_MEM_HPP__
diff --git a/test/functional/launch/shared_mem/tests/test-launch-StaticMem.hpp b/test/functional/launch/shared_mem/tests/test-launch-StaticMem.hpp
index 63b488115b..a424015398 100644
--- a/test/functional/launch/shared_mem/tests/test-launch-StaticMem.hpp
+++ b/test/functional/launch/shared_mem/tests/test-launch-StaticMem.hpp
@@ -10,80 +10,98 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY, typename TEAM_POLICY, typename THREAD_POLICY,
-int THREAD_RANGE>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename TEAM_POLICY,
+          typename THREAD_POLICY,
+          int THREAD_RANGE>
 void LaunchStaticMemTestImpl(INDEX_TYPE block_range)
 {
 
   INDEX_TYPE thread_range(THREAD_RANGE);
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> outer_range(RAJA::stripIndexType(INDEX_TYPE(0)), RAJA::stripIndexType(block_range));
-  RAJA::TypedRangeSegment<INDEX_TYPE> inner_range(RAJA::stripIndexType(INDEX_TYPE(0)), RAJA::stripIndexType(thread_range));
+  RAJA::TypedRangeSegment<INDEX_TYPE> outer_range(
+      RAJA::stripIndexType(INDEX_TYPE(0)), RAJA::stripIndexType(block_range));
+  RAJA::TypedRangeSegment<INDEX_TYPE> inner_range(
+      RAJA::stripIndexType(INDEX_TYPE(0)), RAJA::stripIndexType(thread_range));
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_array;
   INDEX_TYPE* check_array;
   INDEX_TYPE* test_array;
 
-  size_t data_len = RAJA::stripIndexType(block_range)*RAJA::stripIndexType(thread_range);
+  size_t data_len =
+      RAJA::stripIndexType(block_range) * RAJA::stripIndexType(thread_range);
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_array,
-                                     &check_array,
-                                     &test_array);
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res, &working_array,
+                                     &check_array, &test_array);
 
-  //determine the underlying type of block_range
+  // determine the underlying type of block_range
   using s_type = decltype(RAJA::stripIndexType(block_range));
-  
-  for(s_type b=0; b<RAJA::stripIndexType(block_range); ++b) {
-    for(s_type c=0; c<RAJA::stripIndexType(thread_range); ++c) {
-      s_type idx = c + RAJA::stripIndexType(thread_range)*b;
+
+  for (s_type b = 0; b < RAJA::stripIndexType(block_range); ++b)
+  {
+    for (s_type c = 0; c < RAJA::stripIndexType(thread_range); ++c)
+    {
+      s_type idx      = c + RAJA::stripIndexType(thread_range) * b;
       test_array[idx] = INDEX_TYPE(idx);
     }
   }
 
-  RAJA::launch<LAUNCH_POLICY>
-    (RAJA::LaunchParams(RAJA::Teams(RAJA::stripIndexType(block_range)),
-                        RAJA::Threads(RAJA::stripIndexType(thread_range))),
-     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-      RAJA::loop<TEAM_POLICY>(ctx, outer_range, [&](INDEX_TYPE bid) {
-
-          //Since we are using custom index type we have to first use a
-          //type that the device compiler can intialize, we can then use a
-          //pointer to recast the shared memory to our desired type.
-          //This enables us to work around the following warning:
-          // warning #3019-D: dynamic initialization is not supported for
-          //a function-scope static __shared__ variable within a __device__/__global__ function
-          RAJA_TEAM_SHARED char char_Tile[THREAD_RANGE*sizeof(INDEX_TYPE)];
-          INDEX_TYPE *Tile = (INDEX_TYPE *)char_Tile;
-
-          RAJA::loop<THREAD_POLICY>(ctx, inner_range, [&](INDEX_TYPE tid) {
-              Tile[RAJA::stripIndexType(thread_range)-RAJA::stripIndexType(tid)-1] = thread_range-tid-1 + thread_range*bid;
+  RAJA::launch<LAUNCH_POLICY>(
+      RAJA::LaunchParams(RAJA::Teams(RAJA::stripIndexType(block_range)),
+                         RAJA::Threads(RAJA::stripIndexType(thread_range))),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        RAJA::loop<TEAM_POLICY>(
+            ctx, outer_range,
+            [&](INDEX_TYPE bid)
+            {
+              // Since we are using custom index type we have to first use a
+              // type that the device compiler can intialize, we can then use a
+              // pointer to recast the shared memory to our desired type.
+              // This enables us to work around the following warning:
+              //  warning #3019-D: dynamic initialization is not supported for
+              // a function-scope static __shared__ variable within a
+              // __device__/__global__ function
+              RAJA_TEAM_SHARED char
+                  char_Tile[THREAD_RANGE * sizeof(INDEX_TYPE)];
+              INDEX_TYPE* Tile = (INDEX_TYPE*)char_Tile;
+
+              RAJA::loop<THREAD_POLICY>(
+                  ctx, inner_range,
+                  [&](INDEX_TYPE tid)
+                  {
+                    Tile[RAJA::stripIndexType(thread_range) -
+                         RAJA::stripIndexType(tid) - 1] =
+                        thread_range - tid - 1 + thread_range * bid;
+                  });
+
+              ctx.teamSync();
+
+              RAJA::loop<THREAD_POLICY>(
+                  ctx, inner_range,
+                  [&](INDEX_TYPE tid)
+                  {
+                    INDEX_TYPE idx = tid + thread_range * bid;
+                    working_array[RAJA::stripIndexType(idx)] =
+                        Tile[RAJA::stripIndexType(tid)];
+                  });
+
+              ctx.releaseSharedMemory();
             });
-
-          ctx.teamSync();
-
-          RAJA::loop<THREAD_POLICY>(ctx, inner_range, [&](INDEX_TYPE tid) {
-              INDEX_TYPE idx = tid + thread_range * bid;
-              working_array[RAJA::stripIndexType(idx)] = Tile[RAJA::stripIndexType(tid)];
-          });
-
-          ctx.releaseSharedMemory();
-        });
-
-    });
+      });
 
   working_res.memcpy(check_array, working_array, sizeof(INDEX_TYPE) * data_len);
 
-  for (size_t i = 0; i < data_len; i++) {
-    ASSERT_EQ(test_array[RAJA::stripIndexType(i)], check_array[RAJA::stripIndexType(i)]);
+  for (size_t i = 0; i < data_len; i++)
+  {
+    ASSERT_EQ(test_array[RAJA::stripIndexType(i)],
+              check_array[RAJA::stripIndexType(i)]);
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_array,
-                                       check_array,
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_array, check_array,
                                        test_array);
 }
 
@@ -91,28 +109,31 @@ void LaunchStaticMemTestImpl(INDEX_TYPE block_range)
 TYPED_TEST_SUITE_P(LaunchStaticMemTest);
 template <typename T>
 class LaunchStaticMemTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(LaunchStaticMemTest, StaticMemLaunch)
 {
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-  using TEAM_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using THREAD_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<2>>::type;
-
-
-  LaunchStaticMemTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, TEAM_POLICY, THREAD_POLICY, 2>
-    (INDEX_TYPE(4));
-
-  LaunchStaticMemTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, TEAM_POLICY, THREAD_POLICY, 32>
-    (INDEX_TYPE(5));
-
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
+  using TEAM_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using THREAD_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<2>>::type;
+
+
+  LaunchStaticMemTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, TEAM_POLICY,
+                          THREAD_POLICY, 2>(INDEX_TYPE(4));
+
+  LaunchStaticMemTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY, TEAM_POLICY,
+                          THREAD_POLICY, 32>(INDEX_TYPE(5));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchStaticMemTest,
-                            StaticMemLaunch);
+REGISTER_TYPED_TEST_SUITE_P(LaunchStaticMemTest, StaticMemLaunch);
 
 #endif  // __TEST_DYNAMIC_MEM_HPP__
diff --git a/test/functional/launch/tile_icount_direct/tests/test-launch-nested-Tile-iCount-Direct.hpp b/test/functional/launch/tile_icount_direct/tests/test-launch-nested-Tile-iCount-Direct.hpp
index 72d59d290a..48aed7a007 100644
--- a/test/functional/launch/tile_icount_direct/tests/test-launch-nested-Tile-iCount-Direct.hpp
+++ b/test/functional/launch/tile_icount_direct/tests/test-launch-nested-Tile-iCount-Direct.hpp
@@ -10,23 +10,26 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY,
-          typename THREAD_X_POLICY, typename TEAM_X_POLICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename THREAD_X_POLICY,
+          typename TEAM_X_POLICY>
 void LaunchNestedTileDirectTestImpl(INDEX_TYPE M)
 {
 
-  constexpr int threads_x   = 4;
-  constexpr int blocks_x    = 4;
+  constexpr int threads_x = 4;
+  constexpr int blocks_x  = 4;
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, M*threads_x+1);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, M * threads_x + 1);
 
   INDEX_TYPE N1 = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
 
-  INDEX_TYPE no_tiles = (N1-1)/threads_x + 1;
+  INDEX_TYPE no_tiles = (N1 - 1) / threads_x + 1;
 
   INDEX_TYPE N = static_cast<INDEX_TYPE>(RAJA::stripIndexType(N1));
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_ttile_array;
   INDEX_TYPE* check_ttile_array;
   INDEX_TYPE* test_ttile_array;
@@ -36,80 +39,89 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M)
   INDEX_TYPE* test_iloop_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_ttile_array,
-                                     &check_ttile_array,
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res,
+                                     &working_ttile_array, &check_ttile_array,
                                      &test_ttile_array);
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_iloop_array,
-                                     &check_iloop_array,
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res,
+                                     &working_iloop_array, &check_iloop_array,
                                      &test_iloop_array);
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     std::iota(test_ttile_array, test_ttile_array + RAJA::stripIndexType(N), 0);
     std::iota(test_iloop_array, test_iloop_array + RAJA::stripIndexType(N), 0);
 
     RAJA::launch<LAUNCH_POLICY>(
-      RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(threads_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::tile_tcount<TEAM_X_POLICY>(
-          ctx, threads_x, r1, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &x_tile, INDEX_TYPE bx) {
-            RAJA::loop_icount<THREAD_X_POLICY>(
-              ctx, x_tile, [&](INDEX_TYPE tx, INDEX_TYPE ix) {
-
-                working_ttile_array[tx] = bx;
-                working_iloop_array[tx] = ix;
-
-              }
-            );
-          }
-        );
-      }
-    );
-
-  } else { // zero-length segment
+        RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(threads_x)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::tile_tcount<TEAM_X_POLICY>(
+              ctx, threads_x, r1,
+              [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& x_tile,
+                  INDEX_TYPE bx)
+              {
+                RAJA::loop_icount<THREAD_X_POLICY>(
+                    ctx, x_tile,
+                    [&](INDEX_TYPE tx, INDEX_TYPE ix)
+                    {
+                      working_ttile_array[tx] = bx;
+                      working_iloop_array[tx] = ix;
+                    });
+              });
+        });
+  }
+  else
+  {  // zero-length segment
 
-    memset(static_cast<void*>(test_ttile_array), 0, sizeof(INDEX_TYPE) * data_len);
+    memset(static_cast<void*>(test_ttile_array), 0,
+           sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_ttile_array, test_ttile_array, sizeof(INDEX_TYPE) * data_len);
+    working_res.memcpy(working_ttile_array, test_ttile_array,
+                       sizeof(INDEX_TYPE) * data_len);
 
     RAJA::launch<LAUNCH_POLICY>(
-      RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(blocks_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::tile_tcount<TEAM_X_POLICY>(
-          ctx, threads_x, r1, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &x_tile, INDEX_TYPE RAJA_UNUSED_ARG(bx)) {
-            RAJA::loop_icount<THREAD_X_POLICY>(
-              ctx, x_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx), INDEX_TYPE RAJA_UNUSED_ARG (ix)) {
-
-                working_ttile_array[0]++;
-                working_iloop_array[0]++;
-
-              }
-            );
-          }
-        );
-      }
-    );
+        RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(blocks_x)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::tile_tcount<TEAM_X_POLICY>(
+              ctx, threads_x, r1,
+              [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& x_tile,
+                  INDEX_TYPE RAJA_UNUSED_ARG(bx))
+              {
+                RAJA::loop_icount<THREAD_X_POLICY>(
+                    ctx, x_tile,
+                    [&](INDEX_TYPE RAJA_UNUSED_ARG(tx),
+                        INDEX_TYPE RAJA_UNUSED_ARG(ix))
+                    {
+                      working_ttile_array[0]++;
+                      working_iloop_array[0]++;
+                    });
+              });
+        });
   }
 
-  working_res.memcpy(check_ttile_array, working_ttile_array, sizeof(INDEX_TYPE) * data_len);
-  working_res.memcpy(check_iloop_array, working_iloop_array, sizeof(INDEX_TYPE) * data_len);
+  working_res.memcpy(check_ttile_array, working_ttile_array,
+                     sizeof(INDEX_TYPE) * data_len);
+  working_res.memcpy(check_iloop_array, working_iloop_array,
+                     sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     INDEX_TYPE idx = 0;
-    for (INDEX_TYPE bx = INDEX_TYPE(0); bx < no_tiles; ++bx) {
-      for (INDEX_TYPE tx = INDEX_TYPE(0); tx < threads_x; ++tx) {
+    for (INDEX_TYPE bx = INDEX_TYPE(0); bx < no_tiles; ++bx)
+    {
+      for (INDEX_TYPE tx = INDEX_TYPE(0); tx < threads_x; ++tx)
+      {
 
-        if(idx >= N1) break;
+        if (idx >= N1) break;
 
         ASSERT_EQ(check_ttile_array[RAJA::stripIndexType(idx)], bx);
         ASSERT_EQ(check_iloop_array[RAJA::stripIndexType(idx)], tx);
@@ -117,31 +129,26 @@ void LaunchNestedTileDirectTestImpl(INDEX_TYPE M)
         idx++;
       }
     }
-    
-  } else {
-    
+  }
+  else
+  {
+
     ASSERT_EQ(check_ttile_array[0], check_ttile_array[0]);
     ASSERT_EQ(check_iloop_array[0], check_iloop_array[0]);
-    
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_ttile_array,
-                                       check_ttile_array,
-                                       test_ttile_array);
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_ttile_array,
+                                       check_ttile_array, test_ttile_array);
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_iloop_array,
-                                       check_iloop_array,
-                                       test_iloop_array);
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_iloop_array,
+                                       check_iloop_array, test_iloop_array);
 }
 
 
 TYPED_TEST_SUITE_P(LaunchNestedTileDirectTest);
 template <typename T>
 class LaunchNestedTileDirectTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(LaunchNestedTileDirectTest, RangeSegmentTeams)
@@ -149,30 +156,30 @@ TYPED_TEST_P(LaunchNestedTileDirectTest, RangeSegmentTeams)
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
 
-  using TEAM_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using THREAD_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<2>>::type;
+  using TEAM_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using THREAD_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<2>>::type;
 
 
   // test zero-length range segment
   LaunchNestedTileDirectTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                           THREAD_X_POLICY, TEAM_X_POLICY>
-    (INDEX_TYPE(0));
+                                 THREAD_X_POLICY, TEAM_X_POLICY>(INDEX_TYPE(0));
 
-  //Keep at one since we are doing a direct thread test
+  // Keep at one since we are doing a direct thread test
   LaunchNestedTileDirectTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                                 THREAD_X_POLICY, TEAM_X_POLICY>
-    (INDEX_TYPE(1));
-
-    LaunchNestedTileDirectTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                                 THREAD_X_POLICY, TEAM_X_POLICY>
-    (INDEX_TYPE(2));
-
+                                 THREAD_X_POLICY, TEAM_X_POLICY>(INDEX_TYPE(1));
 
+  LaunchNestedTileDirectTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
+                                 THREAD_X_POLICY, TEAM_X_POLICY>(INDEX_TYPE(2));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileDirectTest,
-                            RangeSegmentTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileDirectTest, RangeSegmentTeams);
 
 #endif  // __TEST_LAUNCH_NESTED_TILE_DIRECT_HPP__
diff --git a/test/functional/launch/tile_icount_loop/tests/test-launch-nested-Tile-iCount-Loop.hpp b/test/functional/launch/tile_icount_loop/tests/test-launch-nested-Tile-iCount-Loop.hpp
index 31adc84810..d39a66009d 100644
--- a/test/functional/launch/tile_icount_loop/tests/test-launch-nested-Tile-iCount-Loop.hpp
+++ b/test/functional/launch/tile_icount_loop/tests/test-launch-nested-Tile-iCount-Loop.hpp
@@ -10,26 +10,29 @@
 
 #include <numeric>
 
-template <typename INDEX_TYPE, typename WORKING_RES, typename LAUNCH_POLICY,
-          typename THREAD_X_POLICY, typename TEAM_X_POLICY>
+template <typename INDEX_TYPE,
+          typename WORKING_RES,
+          typename LAUNCH_POLICY,
+          typename THREAD_X_POLICY,
+          typename TEAM_X_POLICY>
 void LaunchNestedTileLoopTestImpl(INDEX_TYPE M)
 {
 
-  constexpr int tile_size   = 4;
+  constexpr int tile_size = 4;
 
-  //following grid will require loop policies
-  constexpr int threads_x   = 3;
-  constexpr int blocks_x    = 1;
+  // following grid will require loop policies
+  constexpr int threads_x = 3;
+  constexpr int blocks_x  = 1;
 
-  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, M*tile_size+1);
+  RAJA::TypedRangeSegment<INDEX_TYPE> r1(0, M * tile_size + 1);
 
   INDEX_TYPE N1 = static_cast<INDEX_TYPE>(r1.end() - r1.begin());
 
-  INDEX_TYPE no_tiles = (N1-1)/tile_size + 1;
+  INDEX_TYPE no_tiles = (N1 - 1) / tile_size + 1;
 
   INDEX_TYPE N = static_cast<INDEX_TYPE>(RAJA::stripIndexType(N1));
 
-  camp::resources::Resource working_res{WORKING_RES::get_default()};
+  camp::resources::Resource working_res {WORKING_RES::get_default()};
   INDEX_TYPE* working_ttile_array;
   INDEX_TYPE* check_ttile_array;
   INDEX_TYPE* test_ttile_array;
@@ -39,80 +42,89 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M)
   INDEX_TYPE* test_iloop_array;
 
   size_t data_len = RAJA::stripIndexType(N);
-  if ( data_len == 0 ) {
+  if (data_len == 0)
+  {
     data_len = 1;
   }
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_ttile_array,
-                                     &check_ttile_array,
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res,
+                                     &working_ttile_array, &check_ttile_array,
                                      &test_ttile_array);
 
-  allocateForallTestData<INDEX_TYPE>(data_len,
-                                     working_res,
-                                     &working_iloop_array,
-                                     &check_iloop_array,
+  allocateForallTestData<INDEX_TYPE>(data_len, working_res,
+                                     &working_iloop_array, &check_iloop_array,
                                      &test_iloop_array);
 
-  if ( RAJA::stripIndexType(N) > 0 ) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     std::iota(test_ttile_array, test_ttile_array + RAJA::stripIndexType(N), 0);
     std::iota(test_iloop_array, test_iloop_array + RAJA::stripIndexType(N), 0);
 
     RAJA::launch<LAUNCH_POLICY>(
-      RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(threads_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::tile_tcount<TEAM_X_POLICY>(
-          ctx, tile_size, r1, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &x_tile, INDEX_TYPE bx) {
-            RAJA::loop_icount<THREAD_X_POLICY>(
-              ctx, x_tile, [&](INDEX_TYPE tx, INDEX_TYPE ix) {
-
-                working_ttile_array[tx] = bx;
-                working_iloop_array[tx] = ix;
-
-              }
-            );
-          }
-        );
-      }
-    );
-  } else { // zero-length segment
+        RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(threads_x)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::tile_tcount<TEAM_X_POLICY>(
+              ctx, tile_size, r1,
+              [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& x_tile,
+                  INDEX_TYPE bx)
+              {
+                RAJA::loop_icount<THREAD_X_POLICY>(
+                    ctx, x_tile,
+                    [&](INDEX_TYPE tx, INDEX_TYPE ix)
+                    {
+                      working_ttile_array[tx] = bx;
+                      working_iloop_array[tx] = ix;
+                    });
+              });
+        });
+  }
+  else
+  {  // zero-length segment
 
-    memset(static_cast<void*>(test_ttile_array), 0, sizeof(INDEX_TYPE) * data_len);
+    memset(static_cast<void*>(test_ttile_array), 0,
+           sizeof(INDEX_TYPE) * data_len);
 
-    working_res.memcpy(working_ttile_array, test_ttile_array, sizeof(INDEX_TYPE) * data_len);
+    working_res.memcpy(working_ttile_array, test_ttile_array,
+                       sizeof(INDEX_TYPE) * data_len);
 
     RAJA::launch<LAUNCH_POLICY>(
-      RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(blocks_x)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
-
-        RAJA::tile_tcount<TEAM_X_POLICY>
-          (ctx, tile_size, r1, [&](RAJA::TypedRangeSegment<INDEX_TYPE> const &x_tile, INDEX_TYPE RAJA_UNUSED_ARG(bx)) {
-
-            RAJA::loop_icount<THREAD_X_POLICY>
-              (ctx, x_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx), INDEX_TYPE RAJA_UNUSED_ARG(ix)) {
-
-                working_ttile_array[0]++;
-                working_iloop_array[0]++;
-
-              }
-            );
-          }
-        );
-      }
-    );
+        RAJA::LaunchParams(RAJA::Teams(blocks_x), RAJA::Threads(blocks_x)),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+        {
+          RAJA::tile_tcount<TEAM_X_POLICY>(
+              ctx, tile_size, r1,
+              [&](RAJA::TypedRangeSegment<INDEX_TYPE> const& x_tile,
+                  INDEX_TYPE RAJA_UNUSED_ARG(bx))
+              {
+                RAJA::loop_icount<THREAD_X_POLICY>(
+                    ctx, x_tile,
+                    [&](INDEX_TYPE RAJA_UNUSED_ARG(tx),
+                        INDEX_TYPE RAJA_UNUSED_ARG(ix))
+                    {
+                      working_ttile_array[0]++;
+                      working_iloop_array[0]++;
+                    });
+              });
+        });
   }
 
-  working_res.memcpy(check_ttile_array, working_ttile_array, sizeof(INDEX_TYPE) * data_len);
-  working_res.memcpy(check_iloop_array, working_iloop_array, sizeof(INDEX_TYPE) * data_len);
+  working_res.memcpy(check_ttile_array, working_ttile_array,
+                     sizeof(INDEX_TYPE) * data_len);
+  working_res.memcpy(check_iloop_array, working_iloop_array,
+                     sizeof(INDEX_TYPE) * data_len);
 
-  if (RAJA::stripIndexType(N) > 0) {
+  if (RAJA::stripIndexType(N) > 0)
+  {
 
     INDEX_TYPE idx = 0;
-    for (INDEX_TYPE bx = INDEX_TYPE(0); bx < no_tiles; ++bx) {
-      for (INDEX_TYPE tx = INDEX_TYPE(0); tx < tile_size; ++tx) {
+    for (INDEX_TYPE bx = INDEX_TYPE(0); bx < no_tiles; ++bx)
+    {
+      for (INDEX_TYPE tx = INDEX_TYPE(0); tx < tile_size; ++tx)
+      {
 
-        if(idx >= N1) break;
+        if (idx >= N1) break;
 
         ASSERT_EQ(check_ttile_array[RAJA::stripIndexType(idx)], bx);
         ASSERT_EQ(check_iloop_array[RAJA::stripIndexType(idx)], tx);
@@ -120,31 +132,26 @@ void LaunchNestedTileLoopTestImpl(INDEX_TYPE M)
         idx++;
       }
     }
-
-  } else {
+  }
+  else
+  {
 
     ASSERT_EQ(check_ttile_array[0], check_ttile_array[0]);
     ASSERT_EQ(check_iloop_array[0], check_iloop_array[0]);
-
   }
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_ttile_array,
-                                       check_ttile_array,
-                                       test_ttile_array);
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_ttile_array,
+                                       check_ttile_array, test_ttile_array);
 
-  deallocateForallTestData<INDEX_TYPE>(working_res,
-                                       working_iloop_array,
-                                       check_iloop_array,
-                                       test_iloop_array);
+  deallocateForallTestData<INDEX_TYPE>(working_res, working_iloop_array,
+                                       check_iloop_array, test_iloop_array);
 }
 
 
 TYPED_TEST_SUITE_P(LaunchNestedTileLoopTest);
 template <typename T>
 class LaunchNestedTileLoopTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_P(LaunchNestedTileLoopTest, RangeSegmentTeams)
@@ -152,31 +159,30 @@ TYPED_TEST_P(LaunchNestedTileLoopTest, RangeSegmentTeams)
 
   using INDEX_TYPE  = typename camp::at<TypeParam, camp::num<0>>::type;
   using WORKING_RES = typename camp::at<TypeParam, camp::num<1>>::type;
-  using LAUNCH_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<0>>::type;
-
-  using TEAM_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<1>>::type;
-  using THREAD_X_POLICY = typename camp::at<typename camp::at<TypeParam,camp::num<2>>::type, camp::num<2>>::type;
+  using LAUNCH_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<0>>::type;
 
+  using TEAM_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<1>>::type;
+  using THREAD_X_POLICY =
+      typename camp::at<typename camp::at<TypeParam, camp::num<2>>::type,
+                        camp::num<2>>::type;
 
 
   // test zero-length range segment
   LaunchNestedTileLoopTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                           THREAD_X_POLICY, TEAM_X_POLICY>
-    (INDEX_TYPE(0));
+                               THREAD_X_POLICY, TEAM_X_POLICY>(INDEX_TYPE(0));
 
-  //Keep at one since we are doing a direct thread test
+  // Keep at one since we are doing a direct thread test
   LaunchNestedTileLoopTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                                 THREAD_X_POLICY, TEAM_X_POLICY>
-    (INDEX_TYPE(1));
+                               THREAD_X_POLICY, TEAM_X_POLICY>(INDEX_TYPE(1));
 
   LaunchNestedTileLoopTestImpl<INDEX_TYPE, WORKING_RES, LAUNCH_POLICY,
-                               THREAD_X_POLICY, TEAM_X_POLICY>
-    (INDEX_TYPE(2));
-
-
+                               THREAD_X_POLICY, TEAM_X_POLICY>(INDEX_TYPE(2));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileLoopTest,
-                            RangeSegmentTeams);
+REGISTER_TYPED_TEST_SUITE_P(LaunchNestedTileLoopTest, RangeSegmentTeams);
 
 #endif  // __TEST_LAUNCH_NESTED_TILE_DIRECT_HPP__
diff --git a/test/functional/scan/tests/test-scan-Exclusive.hpp b/test/functional/scan/tests/test-scan-Exclusive.hpp
index 43f99f9901..40769cee3a 100644
--- a/test/functional/scan/tests/test-scan-Exclusive.hpp
+++ b/test/functional/scan/tests/test-scan-Exclusive.hpp
@@ -16,8 +16,10 @@ ::testing::AssertionResult check_exclusive(const T* actual,
                                            int N,
                                            T init = OP::identity())
 {
-  for (int i = 0; i < N; ++i) {
-    if (*actual != init) {
+  for (int i = 0; i < N; ++i)
+  {
+    if (*actual != init)
+    {
       return ::testing::AssertionFailure()
              << *actual << " != " << init << " (at index " << i << ")";
     }
@@ -29,24 +31,21 @@ ::testing::AssertionResult check_exclusive(const T* actual,
 }
 
 template <typename EXEC_POLICY, typename WORKING_RES, typename OP_TYPE>
-void ScanExclusiveTestImpl(int N,
-                           typename OP_TYPE::result_type offset =
-                           OP_TYPE::identity())
+void ScanExclusiveTestImpl(
+    int N,
+    typename OP_TYPE::result_type offset = OP_TYPE::identity())
 {
   using T = typename OP_TYPE::result_type;
 
-  WORKING_RES res{WORKING_RES::get_default()};
-  camp::resources::Resource working_res{res};
+  WORKING_RES res {WORKING_RES::get_default()};
+  camp::resources::Resource working_res {res};
 
   T* work_in;
   T* work_out;
   T* host_in;
   T* host_out;
 
-  allocScanTestData(N,
-                    working_res,
-                    &work_in, &work_out,
-                    &host_in, &host_out);
+  allocScanTestData(N, working_res, &work_in, &work_out, &host_in, &host_out);
 
   std::iota(host_in, host_in + N, 1);
 
@@ -54,10 +53,9 @@ void ScanExclusiveTestImpl(int N,
   res.memcpy(work_in, host_in, sizeof(T) * N);
   res.wait();
 
-  RAJA::exclusive_scan<EXEC_POLICY>(RAJA::make_span(static_cast<const T*>(work_in), N),
-                                    RAJA::make_span(work_out, N),
-                                    OP_TYPE{},
-                                    offset);
+  RAJA::exclusive_scan<EXEC_POLICY>(
+      RAJA::make_span(static_cast<const T*>(work_in), N),
+      RAJA::make_span(work_out, N), OP_TYPE {}, offset);
 
   res.memcpy(host_out, work_out, sizeof(T) * N);
   res.wait();
@@ -67,28 +65,23 @@ void ScanExclusiveTestImpl(int N,
   // test interface with resource
   res.memcpy(work_in, host_in, sizeof(T) * N);
 
-  RAJA::exclusive_scan<EXEC_POLICY>(res,
-                                    RAJA::make_span(static_cast<const T*>(work_in), N),
-                                    RAJA::make_span(work_out, N),
-                                    OP_TYPE{},
-                                    offset);
+  RAJA::exclusive_scan<EXEC_POLICY>(
+      res, RAJA::make_span(static_cast<const T*>(work_in), N),
+      RAJA::make_span(work_out, N), OP_TYPE {}, offset);
 
   res.memcpy(host_out, work_out, sizeof(T) * N);
   res.wait();
 
   ASSERT_TRUE(check_exclusive<OP_TYPE>(host_out, host_in, N, offset));
 
-  deallocScanTestData(working_res,
-                      work_in, work_out,
-                      host_in, host_out);
+  deallocScanTestData(working_res, work_in, work_out, host_in, host_out);
 }
 
 
 TYPED_TEST_SUITE_P(ScanExclusiveTest);
 template <typename T>
 class ScanExclusiveTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ScanExclusiveTest, ScanExclusive)
 {
@@ -96,33 +89,20 @@ TYPED_TEST_P(ScanExclusiveTest, ScanExclusive)
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<1>>::type;
   using OP_TYPE          = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  ScanExclusiveTestImpl<EXEC_POLICY,
-                              WORKING_RESOURCE,
-                              OP_TYPE>(0);
-  ScanExclusiveTestImpl<EXEC_POLICY,
-                              WORKING_RESOURCE,
-                              OP_TYPE>(357);
-  ScanExclusiveTestImpl<EXEC_POLICY,
-                              WORKING_RESOURCE,
-                              OP_TYPE>(32000);
+  ScanExclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(0);
+  ScanExclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(357);
+  ScanExclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(32000);
 
   //
   // Perform some non-identity offset tests
   //
   using T = typename OP_TYPE::result_type;
 
-  ScanExclusiveTestImpl<EXEC_POLICY,
-                        WORKING_RESOURCE,
-                        OP_TYPE>(0, T(13));
-  ScanExclusiveTestImpl<EXEC_POLICY,
-                        WORKING_RESOURCE,
-                        OP_TYPE>(357, T(15));
-  ScanExclusiveTestImpl<EXEC_POLICY,
-                        WORKING_RESOURCE,
-                        OP_TYPE>(32000, T(2));
+  ScanExclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(0, T(13));
+  ScanExclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(357, T(15));
+  ScanExclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(32000, T(2));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ScanExclusiveTest,
-                            ScanExclusive);
+REGISTER_TYPED_TEST_SUITE_P(ScanExclusiveTest, ScanExclusive);
 
-#endif // __TEST_SCAN_EXCLUSIVE_HPP__
+#endif  // __TEST_SCAN_EXCLUSIVE_HPP__
diff --git a/test/functional/scan/tests/test-scan-ExclusiveInplace.hpp b/test/functional/scan/tests/test-scan-ExclusiveInplace.hpp
index c42e9a8677..34d7b6d470 100644
--- a/test/functional/scan/tests/test-scan-ExclusiveInplace.hpp
+++ b/test/functional/scan/tests/test-scan-ExclusiveInplace.hpp
@@ -16,8 +16,10 @@ ::testing::AssertionResult check_exclusive(const T* actual,
                                            int N,
                                            T init = OP::identity())
 {
-  for (int i = 0; i < N; ++i) {
-    if (*actual != init) {
+  for (int i = 0; i < N; ++i)
+  {
+    if (*actual != init)
+    {
       return ::testing::AssertionFailure()
              << *actual << " != " << init << " (at index " << i << ")";
     }
@@ -29,24 +31,21 @@ ::testing::AssertionResult check_exclusive(const T* actual,
 }
 
 template <typename EXEC_POLICY, typename WORKING_RES, typename OP_TYPE>
-void ScanExclusiveInplaceTestImpl(int N,
-                                  typename OP_TYPE::result_type offset =
-                                  OP_TYPE::identity())
+void ScanExclusiveInplaceTestImpl(
+    int N,
+    typename OP_TYPE::result_type offset = OP_TYPE::identity())
 {
   using T = typename OP_TYPE::result_type;
 
-  WORKING_RES res{WORKING_RES::get_default()};
-  camp::resources::Resource working_res{res};
+  WORKING_RES res {WORKING_RES::get_default()};
+  camp::resources::Resource working_res {res};
 
   T* work_in;
   T* work_out;
   T* host_in;
   T* host_out;
 
-  allocScanTestData(N,
-                    working_res,
-                    &work_in, &work_out,
-                    &host_in, &host_out);
+  allocScanTestData(N, working_res, &work_in, &work_out, &host_in, &host_out);
 
   std::iota(host_in, host_in + N, 1);
 
@@ -55,8 +54,7 @@ void ScanExclusiveInplaceTestImpl(int N,
   res.wait();
 
   RAJA::exclusive_scan_inplace<EXEC_POLICY>(RAJA::make_span(work_in, N),
-                                            OP_TYPE{},
-                                            offset);
+                                            OP_TYPE {}, offset);
 
   res.memcpy(host_out, work_in, sizeof(T) * N);
   res.wait();
@@ -66,27 +64,22 @@ void ScanExclusiveInplaceTestImpl(int N,
   // test interface with resource
   res.memcpy(work_in, host_in, sizeof(T) * N);
 
-  RAJA::exclusive_scan_inplace<EXEC_POLICY>(res,
-                                            RAJA::make_span(work_in, N),
-                                            OP_TYPE{},
-                                            offset);
+  RAJA::exclusive_scan_inplace<EXEC_POLICY>(res, RAJA::make_span(work_in, N),
+                                            OP_TYPE {}, offset);
 
   res.memcpy(host_out, work_in, sizeof(T) * N);
   res.wait();
 
   ASSERT_TRUE(check_exclusive<OP_TYPE>(host_out, host_in, N, offset));
 
-  deallocScanTestData(working_res,
-                      work_in, work_out,
-                      host_in, host_out);
+  deallocScanTestData(working_res, work_in, work_out, host_in, host_out);
 }
 
 
 TYPED_TEST_SUITE_P(ScanExclusiveInplaceTest);
 template <typename T>
 class ScanExclusiveInplaceTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ScanExclusiveInplaceTest, ScanExclusiveInplace)
 {
@@ -94,33 +87,23 @@ TYPED_TEST_P(ScanExclusiveInplaceTest, ScanExclusiveInplace)
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<1>>::type;
   using OP_TYPE          = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  ScanExclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(0);
-  ScanExclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(357);
-  ScanExclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(32000);
+  ScanExclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(0);
+  ScanExclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(357);
+  ScanExclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(32000);
 
   //
   // Perform some non-identity offset tests
   //
   using T = typename OP_TYPE::result_type;
 
-  ScanExclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(0, T(13));
-  ScanExclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(357, T(15));
-  ScanExclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(32000, T(2));
+  ScanExclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(0,
+                                                                       T(13));
+  ScanExclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(357,
+                                                                       T(15));
+  ScanExclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(32000,
+                                                                       T(2));
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ScanExclusiveInplaceTest,
-                            ScanExclusiveInplace);
+REGISTER_TYPED_TEST_SUITE_P(ScanExclusiveInplaceTest, ScanExclusiveInplace);
 
-#endif // __TEST_SCAN_EXCLUSIVE_INPLACE_HPP__
+#endif  // __TEST_SCAN_EXCLUSIVE_INPLACE_HPP__
diff --git a/test/functional/scan/tests/test-scan-Inclusive.hpp b/test/functional/scan/tests/test-scan-Inclusive.hpp
index 9fcc54ed67..43c0c8e1b2 100644
--- a/test/functional/scan/tests/test-scan-Inclusive.hpp
+++ b/test/functional/scan/tests/test-scan-Inclusive.hpp
@@ -11,15 +11,17 @@
 #include <numeric>
 
 template <typename OP>
-::testing::AssertionResult check_inclusive(
-  const typename OP::result_type* actual,
-  const typename OP::result_type* original,
-  int N)
+::testing::AssertionResult
+check_inclusive(const typename OP::result_type* actual,
+                const typename OP::result_type* original,
+                int N)
 {
   typename OP::result_type init = OP::identity();
-  for (int i = 0; i < N; ++i) {
+  for (int i = 0; i < N; ++i)
+  {
     init = OP()(init, *original);
-    if (*actual != init) {
+    if (*actual != init)
+    {
       return ::testing::AssertionFailure()
              << *actual << " != " << init << " (at index " << i << ")";
     }
@@ -34,18 +36,15 @@ void ScanInclusiveTestImpl(int N)
 {
   using T = typename OP_TYPE::result_type;
 
-  WORKING_RES res{WORKING_RES::get_default()};
-  camp::resources::Resource working_res{res};
+  WORKING_RES res {WORKING_RES::get_default()};
+  camp::resources::Resource working_res {res};
 
   T* work_in;
   T* work_out;
   T* host_in;
   T* host_out;
 
-  allocScanTestData(N,
-                    working_res,
-                    &work_in, &work_out,
-                    &host_in, &host_out);
+  allocScanTestData(N, working_res, &work_in, &work_out, &host_in, &host_out);
 
   std::iota(host_in, host_in + N, 1);
 
@@ -53,9 +52,9 @@ void ScanInclusiveTestImpl(int N)
   res.memcpy(work_in, host_in, sizeof(T) * N);
   res.wait();
 
-  RAJA::inclusive_scan<EXEC_POLICY>(RAJA::make_span(static_cast<const T*>(work_in), N),
-                                    RAJA::make_span(work_out, N),
-                                    OP_TYPE{});
+  RAJA::inclusive_scan<EXEC_POLICY>(
+      RAJA::make_span(static_cast<const T*>(work_in), N),
+      RAJA::make_span(work_out, N), OP_TYPE {});
 
   res.memcpy(host_out, work_out, sizeof(T) * N);
   res.wait();
@@ -65,27 +64,23 @@ void ScanInclusiveTestImpl(int N)
   // test interface with resource
   res.memcpy(work_in, host_in, sizeof(T) * N);
 
-  RAJA::inclusive_scan<EXEC_POLICY>(res,
-                                    RAJA::make_span(static_cast<const T*>(work_in), N),
-                                    RAJA::make_span(work_out, N),
-                                    OP_TYPE{});
+  RAJA::inclusive_scan<EXEC_POLICY>(
+      res, RAJA::make_span(static_cast<const T*>(work_in), N),
+      RAJA::make_span(work_out, N), OP_TYPE {});
 
   res.memcpy(host_out, work_out, sizeof(T) * N);
   res.wait();
 
   ASSERT_TRUE(check_inclusive<OP_TYPE>(host_out, host_in, N));
 
-  deallocScanTestData(working_res,
-                      work_in, work_out,
-                      host_in, host_out);
+  deallocScanTestData(working_res, work_in, work_out, host_in, host_out);
 }
 
 
 TYPED_TEST_SUITE_P(ScanInclusiveTest);
 template <typename T>
 class ScanInclusiveTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ScanInclusiveTest, ScanInclusive)
 {
@@ -93,18 +88,11 @@ TYPED_TEST_P(ScanInclusiveTest, ScanInclusive)
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<1>>::type;
   using OP_TYPE          = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  ScanInclusiveTestImpl<EXEC_POLICY,
-                        WORKING_RESOURCE,
-                        OP_TYPE>(0);
-  ScanInclusiveTestImpl<EXEC_POLICY,
-                        WORKING_RESOURCE,
-                        OP_TYPE>(357);
-  ScanInclusiveTestImpl<EXEC_POLICY,
-                        WORKING_RESOURCE,
-                        OP_TYPE>(32000);
+  ScanInclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(0);
+  ScanInclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(357);
+  ScanInclusiveTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(32000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ScanInclusiveTest,
-                            ScanInclusive);
+REGISTER_TYPED_TEST_SUITE_P(ScanInclusiveTest, ScanInclusive);
 
-#endif // __TEST_SCAN_INCLUSIVE_HPP__
+#endif  // __TEST_SCAN_INCLUSIVE_HPP__
diff --git a/test/functional/scan/tests/test-scan-InclusiveInplace.hpp b/test/functional/scan/tests/test-scan-InclusiveInplace.hpp
index 8e4d8e93bf..8f3761865b 100644
--- a/test/functional/scan/tests/test-scan-InclusiveInplace.hpp
+++ b/test/functional/scan/tests/test-scan-InclusiveInplace.hpp
@@ -11,15 +11,17 @@
 #include <numeric>
 
 template <typename OP>
-::testing::AssertionResult check_inclusive(
-  const typename OP::result_type* actual,
-  const typename OP::result_type* original,
-  int N)
+::testing::AssertionResult
+check_inclusive(const typename OP::result_type* actual,
+                const typename OP::result_type* original,
+                int N)
 {
   typename OP::result_type init = OP::identity();
-  for (int i = 0; i < N; ++i) {
+  for (int i = 0; i < N; ++i)
+  {
     init = OP()(init, *original);
-    if (*actual != init) {
+    if (*actual != init)
+    {
       return ::testing::AssertionFailure()
              << *actual << " != " << init << " (at index " << i << ")";
     }
@@ -34,18 +36,15 @@ void ScanInclusiveInplaceTestImpl(int N)
 {
   using T = typename OP_TYPE::result_type;
 
-  WORKING_RES res{WORKING_RES::get_default()};
-  camp::resources::Resource working_res{res};
+  WORKING_RES res {WORKING_RES::get_default()};
+  camp::resources::Resource working_res {res};
 
   T* work_in;
   T* work_out;
   T* host_in;
   T* host_out;
 
-  allocScanTestData(N,
-                    working_res,
-                    &work_in, &work_out,
-                    &host_in, &host_out);
+  allocScanTestData(N, working_res, &work_in, &work_out, &host_in, &host_out);
 
   std::iota(host_in, host_in + N, 1);
 
@@ -54,7 +53,7 @@ void ScanInclusiveInplaceTestImpl(int N)
   res.wait();
 
   RAJA::inclusive_scan_inplace<EXEC_POLICY>(RAJA::make_span(work_in, N),
-                                            OP_TYPE{});
+                                            OP_TYPE {});
 
   res.memcpy(host_out, work_in, sizeof(T) * N);
   res.wait();
@@ -64,26 +63,22 @@ void ScanInclusiveInplaceTestImpl(int N)
   // test interface with resource
   res.memcpy(work_in, host_in, sizeof(T) * N);
 
-  RAJA::inclusive_scan_inplace<EXEC_POLICY>(res,
-                                            RAJA::make_span(work_in, N),
-                                            OP_TYPE{});
+  RAJA::inclusive_scan_inplace<EXEC_POLICY>(res, RAJA::make_span(work_in, N),
+                                            OP_TYPE {});
 
   res.memcpy(host_out, work_in, sizeof(T) * N);
   res.wait();
 
   ASSERT_TRUE(check_inclusive<OP_TYPE>(host_out, host_in, N));
 
-  deallocScanTestData(working_res,
-                      work_in, work_out,
-                      host_in, host_out);
+  deallocScanTestData(working_res, work_in, work_out, host_in, host_out);
 }
 
 
 TYPED_TEST_SUITE_P(ScanInclusiveInplaceTest);
 template <typename T>
 class ScanInclusiveInplaceTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ScanInclusiveInplaceTest, ScanInclusiveInplace)
 {
@@ -91,18 +86,11 @@ TYPED_TEST_P(ScanInclusiveInplaceTest, ScanInclusiveInplace)
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<1>>::type;
   using OP_TYPE          = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  ScanInclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(0);
-  ScanInclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(357);
-  ScanInclusiveInplaceTestImpl<EXEC_POLICY,
-                               WORKING_RESOURCE,
-                               OP_TYPE>(32000);
+  ScanInclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(0);
+  ScanInclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(357);
+  ScanInclusiveInplaceTestImpl<EXEC_POLICY, WORKING_RESOURCE, OP_TYPE>(32000);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ScanInclusiveInplaceTest,
-                            ScanInclusiveInplace);
+REGISTER_TYPED_TEST_SUITE_P(ScanInclusiveInplaceTest, ScanInclusiveInplace);
 
-#endif // __TEST_SCAN_INCLUSIVE_INPLACE_HPP__
+#endif  // __TEST_SCAN_INCLUSIVE_INPLACE_HPP__
diff --git a/test/functional/scan/tests/test-scan-data.hpp b/test/functional/scan/tests/test-scan-data.hpp
index ccfdb47dc2..26b015939f 100644
--- a/test/functional/scan/tests/test-scan-data.hpp
+++ b/test/functional/scan/tests/test-scan-data.hpp
@@ -15,10 +15,12 @@
 template <typename T>
 void allocScanTestData(int N,
                        camp::resources::Resource work_res,
-                       T** work_in, T** work_out,
-                       T** host_in, T** host_out)
+                       T** work_in,
+                       T** work_out,
+                       T** host_in,
+                       T** host_out)
 {
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
   *work_in  = work_res.allocate<T>(N);
   *work_out = work_res.allocate<T>(N);
@@ -29,10 +31,12 @@ void allocScanTestData(int N,
 
 template <typename T>
 void deallocScanTestData(camp::resources::Resource work_res,
-                         T* work_in, T* work_out,
-                         T* host_in, T* host_out)
+                         T* work_in,
+                         T* work_out,
+                         T* host_in,
+                         T* host_out)
 {
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
   work_res.deallocate(work_in);
   work_res.deallocate(work_out);
@@ -40,4 +44,4 @@ void deallocScanTestData(camp::resources::Resource work_res,
   host_res.deallocate(host_out);
 }
 
-#endif // __TEST_SCAN_DATA_HPP__
+#endif  // __TEST_SCAN_DATA_HPP__
diff --git a/test/functional/tensor/matrix/test-tensor-matrix-double.hpp b/test/functional/tensor/matrix/test-tensor-matrix-double.hpp
index 93d08d99f8..4457687cae 100644
--- a/test/functional/tensor/matrix/test-tensor-matrix-double.hpp
+++ b/test/functional/tensor/matrix/test-tensor-matrix-double.hpp
@@ -14,50 +14,122 @@ using MatrixElementType = double;
 using TensorMatrixTypes = ::testing::Types<
 
 #ifdef RAJA_ENABLE_CUDA
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::cuda_warp_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::cuda_warp_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::cuda_warp_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::cuda_warp_register>,
 #endif
 
 #ifdef RAJA_ENABLE_HIP
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,8, RAJA::expt::hip_wave_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::hip_wave_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,16, RAJA::expt::hip_wave_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   8,
+                                   RAJA::expt::hip_wave_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::hip_wave_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   16,
+                                   RAJA::expt::hip_wave_register>,
 #endif
 
 
-//#ifdef __AVX__
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,4, RAJA::expt::avx_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,8, RAJA::expt::avx_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::avx_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,2, RAJA::expt::avx_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,2, RAJA::expt::avx_register>,
+// #ifdef __AVX__
+//     RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//     2,4, RAJA::expt::avx_register>,
+//     RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//     2,8, RAJA::expt::avx_register>,
+//     RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//     4,8, RAJA::expt::avx_register>,
+//     RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//     8,8, RAJA::expt::avx_register>,
+//     RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//     8,4, RAJA::expt::avx_register>,
+//     RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//     4,2, RAJA::expt::avx_register>,
+//     RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//     8,2, RAJA::expt::avx_register>,
 //
-//#endif
+// #endif
 
 
 #ifdef __AVX2__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,4, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,8, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,12, RAJA::expt::avx2_register>,
-
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx2_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::avx2_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx2_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,2, RAJA::expt::avx2_register>,
-//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,2, RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   2,
+                                   4,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   2,
+                                   8,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   2,
+                                   12,
+                                   RAJA::expt::avx2_register>,
+
+//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//    4,8, RAJA::expt::avx2_register>,
+//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//    8,8, RAJA::expt::avx2_register>,
+//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//    8,4, RAJA::expt::avx2_register>,
+//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//    4,2, RAJA::expt::avx2_register>,
+//    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType,
+//    8,2, RAJA::expt::avx2_register>,
 #endif
 
 
 #ifdef __AVX512__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,8, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,4, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   8,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   8,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   4,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::avx512_register>,
 #endif
 
 
@@ -65,7 +137,8 @@ using TensorMatrixTypes = ::testing::Types<
     RAJA::expt::SquareMatrixRegister<MatrixElementType, TensorMatrixLayoutType>,
 
     // Always test the non-vectorized scalar type
-    RAJA::expt::SquareMatrixRegister<MatrixElementType, TensorMatrixLayoutType, RAJA::expt::scalar_register>
-
-  >;
+    RAJA::expt::SquareMatrixRegister<MatrixElementType,
+                                     TensorMatrixLayoutType,
+                                     RAJA::expt::scalar_register>
 
+    >;
diff --git a/test/functional/tensor/matrix/test-tensor-matrix-float.hpp b/test/functional/tensor/matrix/test-tensor-matrix-float.hpp
index 2952fb5f6f..40cb6f67fd 100644
--- a/test/functional/tensor/matrix/test-tensor-matrix-float.hpp
+++ b/test/functional/tensor/matrix/test-tensor-matrix-float.hpp
@@ -14,29 +14,73 @@ using MatrixElementType = float;
 using TensorMatrixTypes = ::testing::Types<
 
 #ifdef RAJA_ENABLE_CUDA
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::cuda_warp_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::cuda_warp_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::cuda_warp_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::cuda_warp_register>,
 #endif
 
 
 #ifdef __AVX__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   8,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::avx_register>,
 #endif
 
 
 #ifdef __AVX2__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   8,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::avx2_register>,
 #endif
 
 
 #ifdef __AVX512__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,8, RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   8,
+                                   RAJA::expt::avx512_register>,
 #endif
 
 
@@ -44,7 +88,8 @@ using TensorMatrixTypes = ::testing::Types<
     RAJA::expt::SquareMatrixRegister<MatrixElementType, TensorMatrixLayoutType>,
 
     // Always test the non-vectorized scalar type
-    RAJA::expt::SquareMatrixRegister<MatrixElementType, TensorMatrixLayoutType, RAJA::expt::scalar_register>
-
-  >;
+    RAJA::expt::SquareMatrixRegister<MatrixElementType,
+                                     TensorMatrixLayoutType,
+                                     RAJA::expt::scalar_register>
 
+    >;
diff --git a/test/functional/tensor/matrix/test-tensor-matrix-int32_t.hpp b/test/functional/tensor/matrix/test-tensor-matrix-int32_t.hpp
index e15729d08a..b3e415abbc 100644
--- a/test/functional/tensor/matrix/test-tensor-matrix-int32_t.hpp
+++ b/test/functional/tensor/matrix/test-tensor-matrix-int32_t.hpp
@@ -14,29 +14,73 @@ using MatrixElementType = int32_t;
 using TensorMatrixTypes = ::testing::Types<
 
 #ifdef RAJA_ENABLE_CUDA
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::cuda_warp_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::cuda_warp_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::cuda_warp_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::cuda_warp_register>,
 #endif
 
 
 #ifdef __AVX__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   8,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::avx_register>,
 #endif
 
 
 #ifdef __AVX2__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   8,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::avx2_register>,
 #endif
 
 
 #ifdef __AVX512__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,8, RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   8,
+                                   RAJA::expt::avx512_register>,
 #endif
 
 
@@ -44,6 +88,8 @@ using TensorMatrixTypes = ::testing::Types<
     RAJA::expt::SquareMatrixRegister<MatrixElementType, TensorMatrixLayoutType>,
 
     // Always test the non-vectorized scalar type
-    RAJA::expt::SquareMatrixRegister<MatrixElementType, TensorMatrixLayoutType, RAJA::expt::scalar_register>
+    RAJA::expt::SquareMatrixRegister<MatrixElementType,
+                                     TensorMatrixLayoutType,
+                                     RAJA::expt::scalar_register>
 
-  >;
+    >;
diff --git a/test/functional/tensor/matrix/test-tensor-matrix-int64_t.hpp b/test/functional/tensor/matrix/test-tensor-matrix-int64_t.hpp
index f91b015b4a..3dca8e44a6 100644
--- a/test/functional/tensor/matrix/test-tensor-matrix-int64_t.hpp
+++ b/test/functional/tensor/matrix/test-tensor-matrix-int64_t.hpp
@@ -14,42 +14,134 @@ using MatrixElementType = int64_t;
 using TensorMatrixTypes = ::testing::Types<
 
 #ifdef RAJA_ENABLE_CUDA
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::cuda_warp_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::cuda_warp_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::cuda_warp_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::cuda_warp_register>,
 #endif
 
 
 #ifdef __AVX__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,4, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,8, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,2, RAJA::expt::avx_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,2, RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   2,
+                                   4,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   2,
+                                   8,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   8,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   2,
+                                   RAJA::expt::avx_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   2,
+                                   RAJA::expt::avx_register>,
 
 #endif
 
 
 #ifdef __AVX2__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,4, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 2,8, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,8, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,2, RAJA::expt::avx2_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,2, RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   2,
+                                   4,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   2,
+                                   8,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   8,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   8,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   2,
+                                   RAJA::expt::avx2_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   2,
+                                   RAJA::expt::avx2_register>,
 #endif
 
 
 #ifdef __AVX512__
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,8, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 4,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,16, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,8, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 16,4, RAJA::expt::avx512_register>,
-    RAJA::expt::RectMatrixRegister<MatrixElementType, TensorMatrixLayoutType, 8,4, RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   8,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   4,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   16,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   8,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   16,
+                                   4,
+                                   RAJA::expt::avx512_register>,
+    RAJA::expt::RectMatrixRegister<MatrixElementType,
+                                   TensorMatrixLayoutType,
+                                   8,
+                                   4,
+                                   RAJA::expt::avx512_register>,
 #endif
 
 
@@ -57,6 +149,8 @@ using TensorMatrixTypes = ::testing::Types<
     RAJA::expt::SquareMatrixRegister<MatrixElementType, TensorMatrixLayoutType>,
 
     // Always test the non-vectorized scalar type
-    RAJA::expt::SquareMatrixRegister<MatrixElementType, TensorMatrixLayoutType, RAJA::expt::scalar_register>
+    RAJA::expt::SquareMatrixRegister<MatrixElementType,
+                                     TensorMatrixLayoutType,
+                                     RAJA::expt::scalar_register>
 
-  >;
+    >;
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-CtorGetSet.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-CtorGetSet.hpp
index 1ceaf94b18..b16684cfdc 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-CtorGetSet.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-CtorGetSet.hpp
@@ -8,55 +8,62 @@
 #ifndef __TEST_TENSOR_REGISTER_CtorGetSet_HPP__
 #define __TEST_TENSOR_REGISTER_CtorGetSet_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void CtorGetSetImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
   //
   // Allocate Data
   //
-  std::vector<element_t> data1_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
+  std::vector<element_t> data1_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_h(
+      data1_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, matrix_t::s_num_rows, matrix_t::s_num_columns);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(
+      data1_ptr, matrix_t::s_num_rows, matrix_t::s_num_columns);
 
 
-  std::vector<element_t> data2_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
-
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr, matrix_t::s_num_rows, matrix_t::s_num_columns);
+  std::vector<element_t> data2_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(
+      data2_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
 
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_d(
+      data2_ptr, matrix_t::s_num_rows, matrix_t::s_num_columns);
 
 
   //
   // Do Operation: broadcast-ctor and copy-ctor
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // create a matrix that contains all 3's
-    matrix_t m1(element_t(3));
-
-    // copy to another matrix
-    matrix_t m2(m1);
-
-    // write out both matrices
-    for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-      for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-        data1_d(i,j) = m1.get(i,j);
-        data2_d(i,j) = m2.get(i,j);
-      }
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // create a matrix that contains all 3's
+        matrix_t m1(element_t(3));
+
+        // copy to another matrix
+        matrix_t m2(m1);
+
+        // write out both matrices
+        for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+        {
+          for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+          {
+            data1_d(i, j) = m1.get(i, j);
+            data2_d(i, j) = m2.get(i, j);
+          }
+        }
+      });
 
   // copy data back to host
   tensor_copy_to_host<policy_t>(data1_vec, data1_ptr);
@@ -66,11 +73,14 @@ void CtorGetSetImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-      ASSERT_SCALAR_EQ(3, data1_h(i,j));
-      ASSERT_SCALAR_EQ(3, data2_h(i,j));
-//      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
+  for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+    {
+      ASSERT_SCALAR_EQ(3, data1_h(i, j));
+      ASSERT_SCALAR_EQ(3, data2_h(i, j));
+      //      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+      //      data2(i,j));
     }
   }
 
@@ -80,15 +90,10 @@ void CtorGetSetImpl()
   //
   tensor_free<policy_t>(data1_ptr);
   tensor_free<policy_t>(data2_ptr);
-
 }
 
 
-
-TYPED_TEST_P(TestTensorMatrix, CtorGetSet)
-{
-  CtorGetSetImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorMatrix, CtorGetSet) { CtorGetSetImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Add.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Add.hpp
index e4e1ff0bfb..1a28374569 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Add.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Add.hpp
@@ -8,71 +8,77 @@
 #ifndef __TEST_TENSOR_MATRIX_ET_Add_HPP__
 #define __TEST_TENSOR_MATRIX_ET_Add_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void ET_AddImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
-  static constexpr camp::idx_t N = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
+  static constexpr camp::idx_t N =
+      RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns) * 2;
 
   //
   // Allocate Row-Major Data
   //
 
   // alloc data1
-  std::vector<element_t> data1_vec(N*N);
+  std::vector<element_t> data1_vec(N * N);
   RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), N, N);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr,  N, N);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, N, N);
 
 
   // alloc data2
-  std::vector<element_t> data2_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(),  N, N);
+  std::vector<element_t> data2_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), N, N);
 
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr,  N, N);
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr, N, N);
 
 
   // alloc data3 with StaticLayout
-  std::vector<element_t> data3_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data3_h(data3_vec.data());
+  std::vector<element_t> data3_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data3_h(
+      data3_vec.data());
 
-  element_t *data3_ptr = tensor_malloc<policy_t>(data3_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data3_d(data3_ptr);
+  element_t* data3_ptr = tensor_malloc<policy_t>(data3_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data3_d(
+      data3_ptr);
 
 
   // alloc data4 with StaticLayout
-  std::vector<element_t> data4_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data4_h(data4_vec.data());
+  std::vector<element_t> data4_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data4_h(
+      data4_vec.data());
 
-  element_t *data4_ptr = tensor_malloc<policy_t>(data4_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data4_d(data4_ptr);
+  element_t* data4_ptr = tensor_malloc<policy_t>(data4_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data4_d(
+      data4_ptr);
 
 
   // alloc data5
-  std::vector<element_t> data5_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> data5_h(data5_vec.data(),  N, N);
-
-  element_t *data5_ptr = tensor_malloc<policy_t>(data5_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data5_d(data5_ptr,  N, N);
+  std::vector<element_t> data5_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> data5_h(data5_vec.data(), N, N);
 
+  element_t* data5_ptr = tensor_malloc<policy_t>(data5_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data5_d(data5_ptr, N, N);
 
 
   // Fill data1, data2, data3, and data4
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      data1_h(i,j) = i*matrix_t::s_num_columns+j;
-      data2_h(i,j) = 1+i+j;
-      data3_h(i,j) = i*matrix_t::s_num_columns+j;
-      data4_h(i,j) = 1+i+j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      data1_h(i, j) = i * matrix_t::s_num_columns + j;
+      data2_h(i, j) = 1 + i + j;
+      data3_h(i, j) = i * matrix_t::s_num_columns + j;
+      data4_h(i, j) = 1 + i + j;
     }
   }
 
@@ -85,26 +91,29 @@ void ET_AddImpl()
   //
   // Do Operation: Full sum of data1, data2, data3, and data4
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto rows = RAJA::expt::RowIndex<int, matrix_t>::all();
-    auto cols = RAJA::expt::ColIndex<int, matrix_t>::all();
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto rows = RAJA::expt::RowIndex<int, matrix_t>::all();
+        auto cols = RAJA::expt::ColIndex<int, matrix_t>::all();
 
-    auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+        auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
 
-    auto SRrows = RAJA::expt::RowIndex<int, matrix_t>::template static_range<0,N>();
-    auto SRcols = RAJA::expt::ColIndex<int, matrix_t>::template static_range<0,N>();
+        auto SRrows =
+            RAJA::expt::RowIndex<int, matrix_t>::template static_range<0, N>();
+        auto SRcols =
+            RAJA::expt::ColIndex<int, matrix_t>::template static_range<0, N>();
 
-    // Access types:
-    // data1_d - Layout with all() and all().
-    // data2_d - Layout with all() and static_range(), which should default to normal Layout access.
-    // data3_d - StaticLayout with static_all() and static_range().
-    // data4_d - StaticLayout with static_all() and all().
-
-    data5_d(cols, rows) = data1_d(rows, cols) + data2_d(cols, SRrows) + data3_d(SArows, SRcols) + data4_d(SAcols, rows);
+        // Access types:
+        // data1_d - Layout with all() and all().
+        // data2_d - Layout with all() and static_range(); normal Layout access.
+        // data3_d - StaticLayout with static_all() and static_range().
+        // data4_d - StaticLayout with static_all() and all().
 
-  });
+        data5_d(cols, rows) = data1_d(rows, cols) + data2_d(cols, SRrows) +
+                              data3_d(SArows, SRcols) + data4_d(SAcols, rows);
+      });
 
   tensor_copy_to_host<policy_t>(data5_vec, data5_ptr);
 
@@ -112,27 +121,34 @@ void ET_AddImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      ASSERT_SCALAR_EQ(data5_h(j,i), data1_h(i,j)+data2_h(j,i)+data3_h(i,j)+data4_h(j,i));
-//      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      ASSERT_SCALAR_EQ(data5_h(j, i), data1_h(i, j) + data2_h(j, i) +
+                                          data3_h(i, j) + data4_h(j, i));
+      //      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+      //      data2(i,j));
     }
   }
 
 
-
   //
   // Loop over all possible sub-matrix sizes using the load_*_nm routines
   //
-  for(camp::idx_t n_size = 0;n_size <= N; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= N; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= N; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= N; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data5
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
-        for(camp::idx_t j = 0;j < N; ++ j){
-          data5_h(j,i) = -1;
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
+        for (camp::idx_t j = 0; j < N; ++j)
+        {
+          data5_h(j, i) = -1;
         }
       }
       tensor_copy_to_device<policy_t>(data5_ptr, data5_vec);
@@ -141,16 +157,19 @@ void ET_AddImpl()
       //
       // Do Operation: Perform partial sum
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        // Load data using a View
-        auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
-        auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            // Load data using a View
+            auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
+            auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
 
-        // Access types:
-        // Layout with range() and range() because loop iterate cannot be determined statically.
+            // Access types:
+            // Layout with range() and range() because loop iterate cannot be
+            // determined statically.
 
-        data5_d(cols, rows) = data1_d(rows, cols) + data2_d(cols, rows);
-      });
+            data5_d(cols, rows) = data1_d(rows, cols) + data2_d(cols, rows);
+          });
 
       tensor_copy_to_host<policy_t>(data5_vec, data5_ptr);
 
@@ -158,19 +177,22 @@ void ET_AddImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
-        for(camp::idx_t j = 0;j < N; ++ j){
-//          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
-          if(i < n_size && j < m_size){
-            ASSERT_SCALAR_EQ(data5_h(j,i), data1_h(i,j)+data2_h(j,i));
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
+        for (camp::idx_t j = 0; j < N; ++j)
+        {
+          //          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+          //          data2(i,j));
+          if (i < n_size && j < m_size)
+          {
+            ASSERT_SCALAR_EQ(data5_h(j, i), data1_h(i, j) + data2_h(j, i));
           }
-          else{
-            ASSERT_SCALAR_EQ(element_t(-1), data5_h(j,i));
+          else
+          {
+            ASSERT_SCALAR_EQ(element_t(-1), data5_h(j, i));
           }
         }
       }
-
-
     }
   }
 
@@ -186,11 +208,7 @@ void ET_AddImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorMatrix, ET_Add)
-{
-  ET_AddImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorMatrix, ET_Add) { ET_AddImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Divide.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Divide.hpp
index a06b87732c..c17692d673 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Divide.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Divide.hpp
@@ -8,17 +8,18 @@
 #ifndef __TEST_TENSOR_MATRIX_ET_Divide_HPP__
 #define __TEST_TENSOR_MATRIX_ET_Divide_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void ET_DivideImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
-  static constexpr camp::idx_t N = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
+  static constexpr camp::idx_t N =
+      RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns) * 2;
 
   //
   // Allocate Row-Major Data
@@ -26,54 +27,59 @@ void ET_DivideImpl()
 
   // alloc data1
 
-  std::vector<element_t> data1_vec(N*N);
+  std::vector<element_t> data1_vec(N * N);
   RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), N, N);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr,  N, N);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, N, N);
 
 
   // alloc data2
 
-  std::vector<element_t> data2_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(),  N, N);
+  std::vector<element_t> data2_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), N, N);
 
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr,  N, N);
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr, N, N);
 
 
   // alloc data3 with StaticLayout
-  std::vector<element_t> data3_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data3_h(data3_vec.data());
+  std::vector<element_t> data3_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data3_h(
+      data3_vec.data());
 
-  element_t *data3_ptr = tensor_malloc<policy_t>(data3_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data3_d(data3_ptr);
+  element_t* data3_ptr = tensor_malloc<policy_t>(data3_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data3_d(
+      data3_ptr);
 
 
   // alloc data4 with StaticLayout
-  std::vector<element_t> data4_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data4_h(data4_vec.data());
+  std::vector<element_t> data4_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data4_h(
+      data4_vec.data());
 
-  element_t *data4_ptr = tensor_malloc<policy_t>(data4_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data4_d(data4_ptr);
+  element_t* data4_ptr = tensor_malloc<policy_t>(data4_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data4_d(
+      data4_ptr);
 
 
   // alloc data5
-  std::vector<element_t> data5_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> data5_h(data5_vec.data(),  N, N);
-
-  element_t *data5_ptr = tensor_malloc<policy_t>(data5_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data5_d(data5_ptr,  N, N);
+  std::vector<element_t> data5_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> data5_h(data5_vec.data(), N, N);
 
+  element_t* data5_ptr = tensor_malloc<policy_t>(data5_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data5_d(data5_ptr, N, N);
 
 
   // Fill data1, data2, data3, and data4
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      data1_h(i,j) = i*matrix_t::s_num_columns+j;
-      data2_h(i,j) = 1+i+j;
-      data3_h(i,j) = i*matrix_t::s_num_columns+j;
-      data4_h(i,j) = 1+i+j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      data1_h(i, j) = i * matrix_t::s_num_columns + j;
+      data2_h(i, j) = 1 + i + j;
+      data3_h(i, j) = i * matrix_t::s_num_columns + j;
+      data4_h(i, j) = 1 + i + j;
     }
   }
 
@@ -86,26 +92,29 @@ void ET_DivideImpl()
   //
   // Do Operation: Full sum of data1, data2, data3, and data4
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto rows = RAJA::expt::RowIndex<int, matrix_t>::all();
-    auto cols = RAJA::expt::ColIndex<int, matrix_t>::all();
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto rows = RAJA::expt::RowIndex<int, matrix_t>::all();
+        auto cols = RAJA::expt::ColIndex<int, matrix_t>::all();
 
-    auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+        auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
 
-    auto SRrows = RAJA::expt::RowIndex<int, matrix_t>::template static_range<0,N>();
-    auto SRcols = RAJA::expt::ColIndex<int, matrix_t>::template static_range<0,N>();
+        auto SRrows =
+            RAJA::expt::RowIndex<int, matrix_t>::template static_range<0, N>();
+        auto SRcols =
+            RAJA::expt::ColIndex<int, matrix_t>::template static_range<0, N>();
 
-    // Access types:
-    // data1_d - Layout with all() and all().
-    // data2_d - Layout with all() and static_range(), which should default to normal Layout access.
-    // data3_d - StaticLayout with static_all() and static_range().
-    // data4_d - StaticLayout with static_all() and all().
-
-    data5_d(cols, rows) = data1_d(rows, cols) / data2_d(cols, SRrows) + data3_d(SArows, SRcols) / data4_d(SAcols, rows);
+        // Access types:
+        // data1_d - Layout with all() and all().
+        // data2_d - Layout with all() and static_range(), which should default
+        // to normal Layout access. data3_d - StaticLayout with static_all() and
+        // static_range(). data4_d - StaticLayout with static_all() and all().
 
-  });
+        data5_d(cols, rows) = data1_d(rows, cols) / data2_d(cols, SRrows) +
+                              data3_d(SArows, SRcols) / data4_d(SAcols, rows);
+      });
 
   tensor_copy_to_host<policy_t>(data5_vec, data5_ptr);
 
@@ -113,27 +122,34 @@ void ET_DivideImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      ASSERT_SCALAR_EQ(data5_h(j,i), data1_h(i,j)/data2_h(j,i)+data3_h(i,j)/data4_h(j,i));
-//      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      ASSERT_SCALAR_EQ(data5_h(j, i), data1_h(i, j) / data2_h(j, i) +
+                                          data3_h(i, j) / data4_h(j, i));
+      //      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+      //      data2(i,j));
     }
   }
 
 
-
   //
   // Loop over all possible sub-matrix sizes using the load_*_nm routines
   //
-  for(camp::idx_t n_size = 0;n_size <= N; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= N; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= N; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= N; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data5
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
-        for(camp::idx_t j = 0;j < N; ++ j){
-          data5_h(j,i) = -1;
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
+        for (camp::idx_t j = 0; j < N; ++j)
+        {
+          data5_h(j, i) = -1;
         }
       }
       tensor_copy_to_device<policy_t>(data5_ptr, data5_vec);
@@ -142,16 +158,19 @@ void ET_DivideImpl()
       //
       // Do Operation: Perform partial sum
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        // Load data using a View
-        auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
-        auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            // Load data using a View
+            auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
+            auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
 
-        // Access types:
-        // Layout with range() and range() because loop iterate cannot be determined statically.
+            // Access types:
+            // Layout with range() and range() because loop iterate cannot be
+            // determined statically.
 
-        data5_d(cols, rows) = data1_d(rows, cols) / data2_d(cols, rows);
-      });
+            data5_d(cols, rows) = data1_d(rows, cols) / data2_d(cols, rows);
+          });
 
       tensor_copy_to_host<policy_t>(data5_vec, data5_ptr);
 
@@ -159,19 +178,22 @@ void ET_DivideImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
-        for(camp::idx_t j = 0;j < N; ++ j){
-//          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
-          if(i < n_size && j < m_size){
-            ASSERT_SCALAR_EQ(data5_h(j,i), data1_h(i,j)/data2_h(j,i));
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
+        for (camp::idx_t j = 0; j < N; ++j)
+        {
+          //          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+          //          data2(i,j));
+          if (i < n_size && j < m_size)
+          {
+            ASSERT_SCALAR_EQ(data5_h(j, i), data1_h(i, j) / data2_h(j, i));
           }
-          else{
-            ASSERT_SCALAR_EQ(element_t(-1), data5_h(j,i));
+          else
+          {
+            ASSERT_SCALAR_EQ(element_t(-1), data5_h(j, i));
           }
         }
       }
-
-
     }
   }
 
@@ -184,15 +206,10 @@ void ET_DivideImpl()
   tensor_free<policy_t>(data3_ptr);
   tensor_free<policy_t>(data4_ptr);
   tensor_free<policy_t>(data5_ptr);
-
 }
 
 
-
-TYPED_TEST_P(TestTensorMatrix, ET_Divide)
-{
-  ET_DivideImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorMatrix, ET_Divide) { ET_DivideImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_LoadStore.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_LoadStore.hpp
index 1d1c725f52..6c9638a779 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_LoadStore.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_LoadStore.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_REGISTER_ET_LoadStore_HPP__
 #define __TEST_TENSOR_REGISTER_ET_LoadStore_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void ET_LoadStoreImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
@@ -24,67 +24,91 @@ void ET_LoadStoreImpl()
   //
 
   // alloc data1
-  std::vector<element_t> data1_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
+  std::vector<element_t> data1_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_h(
+      data1_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, matrix_t::s_num_rows, matrix_t::s_num_columns);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(
+      data1_ptr, matrix_t::s_num_rows, matrix_t::s_num_columns);
 
 
   // alloc data2
-  std::vector<element_t> data2_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
+  std::vector<element_t> data2_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(
+      data2_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
 
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_d(
+      data2_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
 
 
   // alloc data3 with StaticLayout
-  std::vector<element_t> data3_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,matrix_t::s_num_rows,matrix_t::s_num_columns>> data3_h(data3_vec.data());
+  std::vector<element_t> data3_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, matrix_t::s_num_rows,
+                                           matrix_t::s_num_columns>>
+      data3_h(data3_vec.data());
 
-  element_t *data3_ptr = tensor_malloc<policy_t>(data3_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,matrix_t::s_num_rows,matrix_t::s_num_columns>> data3_d(data3_ptr);
+  element_t* data3_ptr = tensor_malloc<policy_t>(data3_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, matrix_t::s_num_rows,
+                                           matrix_t::s_num_columns>>
+      data3_d(data3_ptr);
 
 
   // alloc data4
-  std::vector<element_t> data4_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data4_h(data4_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
+  std::vector<element_t> data4_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data4_h(
+      data4_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
 
-  element_t *data4_ptr = tensor_malloc<policy_t>(data4_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data4_d(data4_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
+  element_t* data4_ptr = tensor_malloc<policy_t>(data4_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data4_d(
+      data4_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
 
 
   // alloc data5
-  std::vector<element_t> data5_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data5_h(data5_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
+  std::vector<element_t> data5_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data5_h(
+      data5_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
 
-  element_t *data5_ptr = tensor_malloc<policy_t>(data5_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data5_d(data5_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
+  element_t* data5_ptr = tensor_malloc<policy_t>(data5_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data5_d(
+      data5_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
 
 
   // alloc data6
-  std::vector<element_t> data6_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data6_h(data6_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
+  std::vector<element_t> data6_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data6_h(
+      data6_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
 
-  element_t *data6_ptr = tensor_malloc<policy_t>(data6_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data6_d(data6_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
+  element_t* data6_ptr = tensor_malloc<policy_t>(data6_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data6_d(
+      data6_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
 
 
   // alloc data7
-  std::vector<element_t> data7_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data7_h(data7_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
-
-  element_t *data7_ptr = tensor_malloc<policy_t>(data7_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data7_d(data7_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
+  std::vector<element_t> data7_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data7_h(
+      data7_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
 
+  element_t* data7_ptr = tensor_malloc<policy_t>(data7_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data7_d(
+      data7_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
 
 
   // Fill data
-  for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-      data1_h(i,j) = i*matrix_t::s_num_columns+j;
-      data3_h(i,j) = i*matrix_t::s_num_columns+j;
+  for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+    {
+      data1_h(i, j) = i * matrix_t::s_num_columns + j;
+      data3_h(i, j) = i * matrix_t::s_num_columns + j;
     }
   }
 
@@ -95,25 +119,29 @@ void ET_LoadStoreImpl()
   //
   // Do Operation: Load/Store full matrix from one view to another
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto rows = RAJA::expt::RowIndex<int, matrix_t>::all();
-    auto cols = RAJA::expt::ColIndex<int, matrix_t>::all();
-
-    auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto rows = RAJA::expt::RowIndex<int, matrix_t>::all();
+        auto cols = RAJA::expt::ColIndex<int, matrix_t>::all();
 
-    auto SRrows = RAJA::expt::RowIndex<int, matrix_t>::template static_range<0,matrix_t::s_num_rows>();
-    auto SRcols = RAJA::expt::ColIndex<int, matrix_t>::template static_range<0,matrix_t::s_num_columns>();
+        auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
 
-    data2_d(cols, rows) = data1_d(rows, cols);
+        auto SRrows = RAJA::expt::RowIndex<
+            int, matrix_t>::template static_range<0, matrix_t::s_num_rows>();
+        auto SRcols = RAJA::expt::ColIndex<
+            int, matrix_t>::template static_range<0, matrix_t::s_num_columns>();
 
-    data4_d(cols, rows) = data3_d(SArows, SRcols);  // mixed static_all and static_range
-    data5_d(cols, rows) = data3_d(SArows, SAcols);  // static_all
-    data6_d(cols, rows) = data3_d(SRrows, SRcols);  // static_range
-    data7_d(cols, rows) = data3_d(rows, SRcols);    // mixed static_range and non-static
+        data2_d(cols, rows) = data1_d(rows, cols);
 
-  });
+        data4_d(cols, rows) =
+            data3_d(SArows, SRcols);  // mixed static_all and static_range
+        data5_d(cols, rows) = data3_d(SArows, SAcols);  // static_all
+        data6_d(cols, rows) = data3_d(SRrows, SRcols);  // static_range
+        data7_d(cols, rows) =
+            data3_d(rows, SRcols);  // mixed static_range and non-static
+      });
 
   tensor_copy_to_host<policy_t>(data2_vec, data2_ptr);
   tensor_copy_to_host<policy_t>(data4_vec, data4_ptr);
@@ -125,14 +153,17 @@ void ET_LoadStoreImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-      //printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1_h(i,j), data2_h(j,i));
-      ASSERT_SCALAR_EQ(data1_h(i,j), data2_h(j,i));
-      ASSERT_SCALAR_EQ(data3_h(i,j), data4_h(j,i));
-      ASSERT_SCALAR_EQ(data3_h(i,j), data5_h(j,i));
-      ASSERT_SCALAR_EQ(data3_h(i,j), data6_h(j,i));
-      ASSERT_SCALAR_EQ(data3_h(i,j), data7_h(j,i));
+  for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+    {
+      // printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1_h(i,j),
+      // data2_h(j,i));
+      ASSERT_SCALAR_EQ(data1_h(i, j), data2_h(j, i));
+      ASSERT_SCALAR_EQ(data3_h(i, j), data4_h(j, i));
+      ASSERT_SCALAR_EQ(data3_h(i, j), data5_h(j, i));
+      ASSERT_SCALAR_EQ(data3_h(i, j), data6_h(j, i));
+      ASSERT_SCALAR_EQ(data3_h(i, j), data7_h(j, i));
     }
   }
 
@@ -140,15 +171,19 @@ void ET_LoadStoreImpl()
   //
   // Loop over all possible sub-matrix sizes using the load_*_nm routines
   //
-  for(camp::idx_t n_size = 0;n_size <= matrix_t::s_num_rows; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= matrix_t::s_num_columns; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= matrix_t::s_num_rows; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= matrix_t::s_num_columns; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data2
       //
-      for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-          data2_h(j,i) = -1;
+      for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+        {
+          data2_h(j, i) = -1;
         }
       }
       tensor_copy_to_device<policy_t>(data2_ptr, data2_vec);
@@ -157,13 +192,15 @@ void ET_LoadStoreImpl()
       //
       // Do Operation: Load/Store partial matrix from one view to another
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        // Load data using a View
-        auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
-        auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            // Load data using a View
+            auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
+            auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
 
-        data2_d(cols, rows) = data1_d(rows, cols);
-      });
+            data2_d(cols, rows) = data1_d(rows, cols);
+          });
 
       tensor_copy_to_host<policy_t>(data2_vec, data2_ptr);
 
@@ -171,19 +208,22 @@ void ET_LoadStoreImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-//          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
-          if(i < n_size && j < m_size){
-            ASSERT_SCALAR_EQ(data1_h(i,j), data2_h(j,i));
+      for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+        {
+          //          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+          //          data2(i,j));
+          if (i < n_size && j < m_size)
+          {
+            ASSERT_SCALAR_EQ(data1_h(i, j), data2_h(j, i));
           }
-          else{
-            ASSERT_SCALAR_EQ(element_t(-1), data2_h(j,i));
+          else
+          {
+            ASSERT_SCALAR_EQ(element_t(-1), data2_h(j, i));
           }
         }
       }
-
-
     }
   }
 
@@ -201,11 +241,7 @@ void ET_LoadStoreImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorMatrix, ET_LoadStore)
-{
-  ET_LoadStoreImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorMatrix, ET_LoadStore) { ET_LoadStoreImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixMatrixMultiply.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixMatrixMultiply.hpp
index 4718172de7..c197a306e4 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixMatrixMultiply.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixMatrixMultiply.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_MATRIX_ET_MatrixMatrixMultiply_HPP__
 #define __TEST_TENSOR_MATRIX_ET_MatrixMatrixMultiply_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void ET_MatrixMatrixMultiplyImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
@@ -23,67 +23,73 @@ void ET_MatrixMatrixMultiplyImpl()
   using B_matrix_t = typename matrix_t::transpose_type;
   using C_matrix_t = typename matrix_t::product_type;
 
-//  static constexpr camp::idx_t N = 8; //matrix_t::s_num_rows*matrix_t::s_num_columns*2;
-  static constexpr camp::idx_t N = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns);
+  //  static constexpr camp::idx_t N = 8;
+  //  //matrix_t::s_num_rows*matrix_t::s_num_columns*2;
+  static constexpr camp::idx_t N =
+      RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns);
   //
   // Allocate Row-Major Data
   //
 
   // alloc data1 - The left matrix
 
-  std::vector<element_t> data1_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data1_h(data1_vec.data());
+  std::vector<element_t> data1_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data1_h(
+      data1_vec.data());
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data1_d(data1_ptr);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data1_d(
+      data1_ptr);
 
 
   // alloc data2 - The right matrix
 
-  std::vector<element_t> data2_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data2_h(data2_vec.data());
+  std::vector<element_t> data2_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data2_h(
+      data2_vec.data());
 
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data2_d(data2_ptr);
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data2_d(
+      data2_ptr);
 
 
   // alloc data3 - The result matrix
 
-  std::vector<element_t> data3_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> data3_h(data3_vec.data(),  N, N);
-
-  element_t *data3_ptr = tensor_malloc<policy_t>(data3_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data3_d(data3_ptr,  N, N);
+  std::vector<element_t> data3_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> data3_h(data3_vec.data(), N, N);
 
+  element_t* data3_ptr = tensor_malloc<policy_t>(data3_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data3_d(data3_ptr, N, N);
 
 
   // Fill data1 and data2
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      data1_h(i,j) = 1+i*N+j;
-      data2_h(i,j) = 3+i*N+j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      data1_h(i, j) = 1 + i * N + j;
+      data2_h(i, j) = 3 + i * N + j;
     }
-
   }
 
-//  printf("data1:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data1_h(i,j));
-//    }
-//    printf("\n");
-//  }
-
-
-//  printf("data2:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data2_h(i,j));
-//    }
-//    printf("\n");
-//  }
+  //  printf("data1:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data1_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
+
+
+  //  printf("data2:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data2_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
 
   tensor_copy_to_device<policy_t>(data1_ptr, data1_vec);
   tensor_copy_to_device<policy_t>(data2_ptr, data2_vec);
@@ -92,70 +98,82 @@ void ET_MatrixMatrixMultiplyImpl()
   //
   // Do Operation: A*B
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto A_rows = RAJA::expt::RowIndex<int, A_matrix_t>::all();
-    auto A_cols = RAJA::expt::ColIndex<int, A_matrix_t>::template static_range<0,N>();
-
-    auto B_rows = RAJA::expt::RowIndex<int, B_matrix_t>::template static_range<0,N>();
-    auto B_cols = RAJA::expt::ColIndex<int, B_matrix_t>::static_all();
-
-    auto C_rows = RAJA::expt::RowIndex<int, C_matrix_t>::all();
-    auto C_cols = RAJA::expt::ColIndex<int, C_matrix_t>::all();
-
-    data3_d(C_rows, C_cols) = data1_d(A_rows, A_cols) * data2_d(B_rows, B_cols);
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto A_rows = RAJA::expt::RowIndex<int, A_matrix_t>::all();
+        auto A_cols =
+            RAJA::expt::ColIndex<int, A_matrix_t>::template static_range<0,
+                                                                         N>();
+
+        auto B_rows =
+            RAJA::expt::RowIndex<int, B_matrix_t>::template static_range<0,
+                                                                         N>();
+        auto B_cols = RAJA::expt::ColIndex<int, B_matrix_t>::static_all();
+
+        auto C_rows = RAJA::expt::RowIndex<int, C_matrix_t>::all();
+        auto C_cols = RAJA::expt::ColIndex<int, C_matrix_t>::all();
+
+        data3_d(C_rows, C_cols) =
+            data1_d(A_rows, A_cols) * data2_d(B_rows, B_cols);
+      });
 
   tensor_copy_to_host<policy_t>(data3_vec, data3_ptr);
 
-//  printf("data3:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data3_h(i,j));
-//    }
-//    printf("\n");
-//  }
+  //  printf("data3:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data3_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
 
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
       element_t expected(0);
-      for(camp::idx_t k = 0;k < N; ++ k){
-        expected += data1_h(i,k)*data2_h(k,j);
+      for (camp::idx_t k = 0; k < N; ++k)
+      {
+        expected += data1_h(i, k) * data2_h(k, j);
       }
-//    printf("i=%d, j=%d, expected=%e, data3=%e\n", (int)i, (int)j, (double)expected, (double)data3_h(i,j));
-
-      ASSERT_SCALAR_EQ(expected, data3_h(i,j));
-//      data3_h(i,j) = expected;
+      //    printf("i=%d, j=%d, expected=%e, data3=%e\n", (int)i, (int)j,
+      //    (double)expected, (double)data3_h(i,j));
 
+      ASSERT_SCALAR_EQ(expected, data3_h(i, j));
+      //      data3_h(i,j) = expected;
     }
   }
 
-//  printf("expected:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data3_h(i,j));
-//    }
-//    printf("\n");
-//  }
+  //  printf("expected:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data3_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
 
 
   //
   // Loop over all possible sub-matrix sizes for A*x
   //
-  for(camp::idx_t n_size = 0;n_size <= N; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= N; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= N; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= N; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data3
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
-        for(camp::idx_t j = 0;j < N; ++ j){
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
+        for (camp::idx_t j = 0; j < N; ++j)
+        {
           data3_h(i, j) = 0;
         }
       }
@@ -166,19 +184,27 @@ void ET_MatrixMatrixMultiplyImpl()
       //
       // Do Operation A*B
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-        auto A_rows = RAJA::expt::RowIndex<int, A_matrix_t>::range(0, n_size);
-        auto A_cols = RAJA::expt::ColIndex<int, A_matrix_t>::range(0, m_size);
-
-        auto B_rows = RAJA::expt::RowIndex<int, B_matrix_t>::range(0, m_size);
-        auto B_cols = RAJA::expt::ColIndex<int, B_matrix_t>::range(0, n_size);
-
-        auto C_rows = RAJA::expt::RowIndex<int, C_matrix_t>::range(0, n_size);
-        auto C_cols = RAJA::expt::ColIndex<int, C_matrix_t>::range(0, n_size);
-
-        data3_d(C_rows, C_cols) = data1_d(A_rows, A_cols) * data2_d(B_rows, B_cols);
-      });
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            auto A_rows =
+                RAJA::expt::RowIndex<int, A_matrix_t>::range(0, n_size);
+            auto A_cols =
+                RAJA::expt::ColIndex<int, A_matrix_t>::range(0, m_size);
+
+            auto B_rows =
+                RAJA::expt::RowIndex<int, B_matrix_t>::range(0, m_size);
+            auto B_cols =
+                RAJA::expt::ColIndex<int, B_matrix_t>::range(0, n_size);
+
+            auto C_rows =
+                RAJA::expt::RowIndex<int, C_matrix_t>::range(0, n_size);
+            auto C_cols =
+                RAJA::expt::ColIndex<int, C_matrix_t>::range(0, n_size);
+
+            data3_d(C_rows, C_cols) =
+                data1_d(A_rows, A_cols) * data2_d(B_rows, B_cols);
+          });
 
       tensor_copy_to_host<policy_t>(data3_vec, data3_ptr);
 
@@ -189,37 +215,35 @@ void ET_MatrixMatrixMultiplyImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < n_size; ++ i){
-        for(camp::idx_t j = 0;j < n_size; ++ j){
+      for (camp::idx_t i = 0; i < n_size; ++i)
+      {
+        for (camp::idx_t j = 0; j < n_size; ++j)
+        {
           element_t expected(0);
-          for(camp::idx_t k = 0;k < m_size; ++ k){
-            expected += data1_h(i,k)*data2_h(k,j);
+          for (camp::idx_t k = 0; k < m_size; ++k)
+          {
+            expected += data1_h(i, k) * data2_h(k, j);
           }
-    //    printf("i=%d, j=%d, expected=%e, data3=%e\n", (int)i, (int)j, (double)expected, (double)data3_h(i,j));
-
-          ASSERT_SCALAR_EQ(expected, data3_h(i,j));
-    //      data3_h(i,j) = expected;
+          //    printf("i=%d, j=%d, expected=%e, data3=%e\n", (int)i, (int)j,
+          //    (double)expected, (double)data3_h(i,j));
 
+          ASSERT_SCALAR_EQ(expected, data3_h(i, j));
+          //      data3_h(i,j) = expected;
         }
       }
-
-
     }
   }
 
 
-
   //
   // Free data
   //
   tensor_free<policy_t>(data1_ptr);
   tensor_free<policy_t>(data2_ptr);
   tensor_free<policy_t>(data3_ptr);
-
 }
 
 
-
 TYPED_TEST_P(TestTensorMatrix, ET_MatrixMatrixMultiply)
 {
   ET_MatrixMatrixMultiplyImpl<TypeParam>();
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixMatrixMultiplyAdd.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixMatrixMultiplyAdd.hpp
index 8bebe94c26..f8d136d0e7 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixMatrixMultiplyAdd.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixMatrixMultiplyAdd.hpp
@@ -8,17 +8,17 @@
 #ifndef __TEST_TENSOR_MATRIX_ET_MatrixMatrixMultiplyAdd_HPP__
 #define __TEST_TENSOR_MATRIX_ET_MatrixMatrixMultiplyAdd_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
-RAJA_INDEX_VALUE( TX, "TX" );
-RAJA_INDEX_VALUE( TY, "TY" );
+RAJA_INDEX_VALUE(TX, "TX");
+RAJA_INDEX_VALUE(TY, "TY");
 
 template <typename MATRIX_TYPE>
 void ET_MatrixMatrixMultiplyAddImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
@@ -26,68 +26,75 @@ void ET_MatrixMatrixMultiplyAddImpl()
   using B_matrix_t = typename matrix_t::transpose_type;
   using C_matrix_t = typename matrix_t::product_type;
 
-//  static constexpr camp::idx_t N = 8; //matrix_t::s_num_rows*matrix_t::s_num_columns*2;
-  static constexpr camp::idx_t N = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns);
+  //  static constexpr camp::idx_t N = 8;
+  //  //matrix_t::s_num_rows*matrix_t::s_num_columns*2;
+  static constexpr camp::idx_t N =
+      RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns);
   //
   // Allocate Row-Major Data
   //
 
   // alloc data1 - The left matrix
 
-  std::vector<element_t> data1_vec(N*N);
-  RAJA::TypedView<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>, TX, TY> data1_h(data1_vec.data());
+  std::vector<element_t> data1_vec(N * N);
+  RAJA::TypedView<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>, TX, TY>
+      data1_h(data1_vec.data());
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::TypedView<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>, TX, TY> data1_d(data1_ptr);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::TypedView<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>, TX, TY>
+      data1_d(data1_ptr);
 
 
   // alloc data2 - The right matrix
 
-  std::vector<element_t> data2_vec(N*N);
-  RAJA::TypedView<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>, TX, TY> data2_h(data2_vec.data());
+  std::vector<element_t> data2_vec(N * N);
+  RAJA::TypedView<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>, TX, TY>
+      data2_h(data2_vec.data());
 
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::TypedView<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>, TX, TY> data2_d(data2_ptr);
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::TypedView<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>, TX, TY>
+      data2_d(data2_ptr);
 
 
   // alloc data3 - The result matrix
 
-  std::vector<element_t> data3_vec(N*N);
-  RAJA::TypedView<element_t, RAJA::Layout<2>, TX, TY> data3_h(data3_vec.data(),  N, N);
-
-  element_t *data3_ptr = tensor_malloc<policy_t>(data3_vec);
-  RAJA::TypedView<element_t, RAJA::Layout<2>, TX, TY> data3_d(data3_ptr,  N, N);
+  std::vector<element_t> data3_vec(N * N);
+  RAJA::TypedView<element_t, RAJA::Layout<2>, TX, TY> data3_h(data3_vec.data(),
+                                                              N, N);
 
+  element_t* data3_ptr = tensor_malloc<policy_t>(data3_vec);
+  RAJA::TypedView<element_t, RAJA::Layout<2>, TX, TY> data3_d(data3_ptr, N, N);
 
 
   // Fill data1 and data2
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      data1_h(i,j) = 1+i*N+j;
-      data2_h(i,j) = 3+i*N+j;
-      data3_h(i,j) = 5*i+13*j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      data1_h(i, j) = 1 + i * N + j;
+      data2_h(i, j) = 3 + i * N + j;
+      data3_h(i, j) = 5 * i + 13 * j;
     }
-
   }
 
-//  printf("data1:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data1_h(i,j));
-//    }
-//    printf("\n");
-//  }
-
-
-//  printf("data2:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data2_h(i,j));
-//    }
-//    printf("\n");
-//  }
+  //  printf("data1:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data1_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
+
+
+  //  printf("data2:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data2_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
 
   tensor_copy_to_device<policy_t>(data1_ptr, data1_vec);
   tensor_copy_to_device<policy_t>(data2_ptr, data2_vec);
@@ -97,71 +104,83 @@ void ET_MatrixMatrixMultiplyAddImpl()
   //
   // Do Operation: A*B
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto A_rows = RAJA::expt::RowIndex<int, A_matrix_t>::all();
-    auto A_cols = RAJA::expt::ColIndex<int, A_matrix_t>::template static_range<0,N>();
-
-    auto B_rows = RAJA::expt::RowIndex<int, B_matrix_t>::template static_range<0,N>();
-    auto B_cols = RAJA::expt::ColIndex<int, B_matrix_t>::static_all();
-
-    auto C_rows = RAJA::expt::RowIndex<int, C_matrix_t>::all();
-    auto C_cols = RAJA::expt::ColIndex<int, C_matrix_t>::all();
-
-    data3_d(C_rows, C_cols) += data1_d(A_rows, A_cols) * data2_d(B_rows, B_cols);
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto A_rows = RAJA::expt::RowIndex<int, A_matrix_t>::all();
+        auto A_cols =
+            RAJA::expt::ColIndex<int, A_matrix_t>::template static_range<0,
+                                                                         N>();
+
+        auto B_rows =
+            RAJA::expt::RowIndex<int, B_matrix_t>::template static_range<0,
+                                                                         N>();
+        auto B_cols = RAJA::expt::ColIndex<int, B_matrix_t>::static_all();
+
+        auto C_rows = RAJA::expt::RowIndex<int, C_matrix_t>::all();
+        auto C_cols = RAJA::expt::ColIndex<int, C_matrix_t>::all();
+
+        data3_d(C_rows, C_cols) +=
+            data1_d(A_rows, A_cols) * data2_d(B_rows, B_cols);
+      });
 
   tensor_copy_to_host<policy_t>(data3_vec, data3_ptr);
 
-//  printf("data3:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data3_h(i,j));
-//    }
-//    printf("\n");
-//  }
+  //  printf("data3:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data3_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
 
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      element_t expected(5*i+13*j);
-      for(camp::idx_t k = 0;k < N; ++ k){
-        expected += data1_h(i,k)*data2_h(k,j);
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      element_t expected(5 * i + 13 * j);
+      for (camp::idx_t k = 0; k < N; ++k)
+      {
+        expected += data1_h(i, k) * data2_h(k, j);
       }
-//    printf("i=%d, j=%d, expected=%e, data3=%e\n", (int)i, (int)j, (double)expected, (double)data3_h(i,j));
-
-      ASSERT_SCALAR_EQ(expected, data3_h(i,j));
-//      data3_h(i,j) = expected;
+      //    printf("i=%d, j=%d, expected=%e, data3=%e\n", (int)i, (int)j,
+      //    (double)expected, (double)data3_h(i,j));
 
+      ASSERT_SCALAR_EQ(expected, data3_h(i, j));
+      //      data3_h(i,j) = expected;
     }
   }
 
-//  printf("expected:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data3_h(i,j));
-//    }
-//    printf("\n");
-//  }
+  //  printf("expected:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data3_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
 
 
   //
   // Loop over all possible sub-matrix sizes for A*x
   //
-  for(camp::idx_t n_size = 0;n_size <= N; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= N; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= N; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= N; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data3
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
-        for(camp::idx_t j = 0;j < N; ++ j){
-          data3_h(i,j) = 5*i+13*j;
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
+        for (camp::idx_t j = 0; j < N; ++j)
+        {
+          data3_h(i, j) = 5 * i + 13 * j;
         }
       }
 
@@ -171,20 +190,28 @@ void ET_MatrixMatrixMultiplyAddImpl()
       //
       // Do Operation A*B
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-        auto A_rows = RAJA::expt::RowIndex<int, A_matrix_t>::range(0, n_size);
-        auto A_cols = RAJA::expt::ColIndex<int, A_matrix_t>::range(0, m_size);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            auto A_rows =
+                RAJA::expt::RowIndex<int, A_matrix_t>::range(0, n_size);
+            auto A_cols =
+                RAJA::expt::ColIndex<int, A_matrix_t>::range(0, m_size);
 
-        auto B_rows = RAJA::expt::RowIndex<int, B_matrix_t>::range(0, m_size);
-        auto B_cols = RAJA::expt::ColIndex<int, B_matrix_t>::range(0, n_size);
+            auto B_rows =
+                RAJA::expt::RowIndex<int, B_matrix_t>::range(0, m_size);
+            auto B_cols =
+                RAJA::expt::ColIndex<int, B_matrix_t>::range(0, n_size);
 
-        auto C_rows = RAJA::expt::RowIndex<int, C_matrix_t>::range(0, n_size);
-        auto C_cols = RAJA::expt::ColIndex<int, C_matrix_t>::range(0, n_size);
+            auto C_rows =
+                RAJA::expt::RowIndex<int, C_matrix_t>::range(0, n_size);
+            auto C_cols =
+                RAJA::expt::ColIndex<int, C_matrix_t>::range(0, n_size);
 
 
-        data3_d(C_rows, C_cols) += data1_d(A_rows, A_cols) * data2_d(B_rows, B_cols);
-      });
+            data3_d(C_rows, C_cols) +=
+                data1_d(A_rows, A_cols) * data2_d(B_rows, B_cols);
+          });
 
       tensor_copy_to_host<policy_t>(data3_vec, data3_ptr);
 
@@ -195,35 +222,34 @@ void ET_MatrixMatrixMultiplyAddImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < n_size; ++ i){
-        for(camp::idx_t j = 0;j < n_size; ++ j){
-          element_t expected(5*i+13*j);
-          for(camp::idx_t k = 0;k < m_size; ++ k){
-            expected += data1_h(i,k)*data2_h(k,j);
+      for (camp::idx_t i = 0; i < n_size; ++i)
+      {
+        for (camp::idx_t j = 0; j < n_size; ++j)
+        {
+          element_t expected(5 * i + 13 * j);
+          for (camp::idx_t k = 0; k < m_size; ++k)
+          {
+            expected += data1_h(i, k) * data2_h(k, j);
           }
-    //    printf("i=%d, j=%d, expected=%e, data3=%e\n", (int)i, (int)j, (double)expected, (double)data3_h(i,j));
+          //    printf("i=%d, j=%d, expected=%e, data3=%e\n", (int)i, (int)j,
+          //    (double)expected, (double)data3_h(i,j));
 
-          ASSERT_SCALAR_EQ(expected, data3_h(i,j));
+          ASSERT_SCALAR_EQ(expected, data3_h(i, j));
         }
       }
-
-
     }
   }
 
 
-
   //
   // Free data
   //
   tensor_free<policy_t>(data1_ptr);
   tensor_free<policy_t>(data2_ptr);
   tensor_free<policy_t>(data3_ptr);
-
 }
 
 
-
 TYPED_TEST_P(TestTensorMatrix, ET_MatrixMatrixMultiplyAdd)
 {
   ET_MatrixMatrixMultiplyAddImpl<TypeParam>();
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixVector.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixVector.hpp
index 0d7f2fd137..e67e4a1389 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixVector.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_MatrixVector.hpp
@@ -8,74 +8,80 @@
 #ifndef __TEST_TENSOR_MATRIX_ET_MatrixVector_HPP__
 #define __TEST_TENSOR_MATRIX_ET_MatrixVector_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void ET_MatrixVectorImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
   using cvector_t = typename matrix_t::column_vector_type;
   using rvector_t = typename matrix_t::row_vector_type;
 
-//  static constexpr camp::idx_t N = 8; //matrix_t::s_num_rows*matrix_t::s_num_columns*2;
-  static constexpr camp::idx_t N = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
+  //  static constexpr camp::idx_t N = 8;
+  //  //matrix_t::s_num_rows*matrix_t::s_num_columns*2;
+  static constexpr camp::idx_t N =
+      RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns) * 2;
   //
   // Allocate Row-Major Data
   //
 
   // alloc data1 - The matrix
 
-  std::vector<element_t> data1_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data1_h(data1_vec.data());
+  std::vector<element_t> data1_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data1_h(
+      data1_vec.data());
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data1_d(data1_ptr);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data1_d(
+      data1_ptr);
 
 
   // alloc data2 - The input vector
 
   std::vector<element_t> data2_vec(N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_I,N>> data2_h(data2_vec.data());
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_I, N>> data2_h(
+      data2_vec.data());
 
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_I,N>> data2_d(data2_ptr);
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_I, N>> data2_d(data2_ptr);
 
 
   // alloc data3 - The output vector
 
   std::vector<element_t> data3_vec(N);
-  RAJA::View<element_t, RAJA::Layout<1,int,0>> data3_h(data3_vec.data(),  N);
-
-  element_t *data3_ptr = tensor_malloc<policy_t>(data3_vec);
-  RAJA::View<element_t, RAJA::Layout<1,int,0>> data3_d(data3_ptr,  N);
+  RAJA::View<element_t, RAJA::Layout<1, int, 0>> data3_h(data3_vec.data(), N);
 
+  element_t* data3_ptr = tensor_malloc<policy_t>(data3_vec);
+  RAJA::View<element_t, RAJA::Layout<1, int, 0>> data3_d(data3_ptr, N);
 
 
   // Fill data1 and data2
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      data1_h(i,j) = 3+i*N+j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      data1_h(i, j) = 3 + i * N + j;
     }
-    data2_h(i) = i+1;
+    data2_h(i) = i + 1;
   }
 
-//  printf("data1:\n");
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("  ");
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%lf  ", (double)data1_h(i,j));
-//    }
-//    printf("\n");
-//  }
+  //  printf("data1:\n");
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("  ");
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%lf  ", (double)data1_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
 
 
-//  for(camp::idx_t i = 0;i < N; ++ i){
-//    printf("data2[%d]=%lf\n", (int)i, (double)data2_h(i));
-//  }
+  //  for(camp::idx_t i = 0;i < N; ++ i){
+  //    printf("data2[%d]=%lf\n", (int)i, (double)data2_h(i));
+  //  }
 
   tensor_copy_to_device<policy_t>(data1_ptr, data1_vec);
   tensor_copy_to_device<policy_t>(data2_ptr, data2_vec);
@@ -84,17 +90,18 @@ void ET_MatrixVectorImpl()
   //
   // Do Operation: A*x
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto rows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto cols = RAJA::expt::ColIndex<int, matrix_t>::template static_range<0,N>();
-
-    auto vrow = RAJA::expt::VectorIndex<int, rvector_t>::static_all();
-    auto vcol = RAJA::expt::VectorIndex<int, cvector_t>::all();
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto rows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto cols =
+            RAJA::expt::ColIndex<int, matrix_t>::template static_range<0, N>();
 
-    data3_d(vcol) = data1_d(rows, cols) * data2_d(vrow);
+        auto vrow = RAJA::expt::VectorIndex<int, rvector_t>::static_all();
+        auto vcol = RAJA::expt::VectorIndex<int, cvector_t>::all();
 
-  });
+        data3_d(vcol) = data1_d(rows, cols) * data2_d(vrow);
+      });
 
   tensor_copy_to_host<policy_t>(data3_vec, data3_ptr);
 
@@ -102,30 +109,36 @@ void ET_MatrixVectorImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < N; ++ i){
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
 
     element_t expected(0);
-    for(camp::idx_t j = 0;j < N; ++ j){
-      expected += data1_h(i,j)*data2_h(j);
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      expected += data1_h(i, j) * data2_h(j);
     }
-//    printf("i=%d, expected=%e, data3=%e\n", (int)i, (double)expected, (double)data3_h(i));
+    //    printf("i=%d, expected=%e, data3=%e\n", (int)i, (double)expected,
+    //    (double)data3_h(i));
 
     ASSERT_SCALAR_EQ(expected, data3_h(i));
   }
 
-//return;
+  // return;
 
 
   //
   // Loop over all possible sub-matrix sizes for A*x
   //
-  for(camp::idx_t n_size = 0;n_size <= N; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= N; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= N; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= N; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data3
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
         data3_h(i) = 0;
       }
 
@@ -135,16 +148,20 @@ void ET_MatrixVectorImpl()
       //
       // Do Operation (x')*A
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        // Load data using a View
-        auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
-        auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            // Load data using a View
+            auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
+            auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
 
-        auto vrow = RAJA::expt::VectorIndex<int, rvector_t>::range(0, m_size);
-        auto vcol = RAJA::expt::VectorIndex<int, cvector_t>::range(0, n_size);
+            auto vrow =
+                RAJA::expt::VectorIndex<int, rvector_t>::range(0, m_size);
+            auto vcol =
+                RAJA::expt::VectorIndex<int, cvector_t>::range(0, n_size);
 
-        data3_d(vcol) = data1_d(rows, cols) * data2_d(vrow);
-      });
+            data3_d(vcol) = data1_d(rows, cols) * data2_d(vrow);
+          });
 
       tensor_copy_to_host<policy_t>(data3_vec, data3_ptr);
 
@@ -152,43 +169,43 @@ void ET_MatrixVectorImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < n_size; ++ i){
+      for (camp::idx_t i = 0; i < n_size; ++i)
+      {
 
 
         element_t expected(0);
-        for(camp::idx_t j = 0;j < m_size; ++ j){
-          expected += data1_h(i,j) * data2_h(j);
+        for (camp::idx_t j = 0; j < m_size; ++j)
+        {
+          expected += data1_h(i, j) * data2_h(j);
         }
 
-        if(i >= n_size || m_size == 0){
+        if (i >= n_size || m_size == 0)
+        {
           expected = 0;
         }
 
-//        printf("i=%d, expected=%e, data3=%e\n", (int)i, (double)expected, (double)data3_h(i));
+        //        printf("i=%d, expected=%e, data3=%e\n", (int)i,
+        //        (double)expected, (double)data3_h(i));
         ASSERT_SCALAR_EQ(expected, data3_h(i));
-
       }
-
-
     }
   }
 
 
-
   //
   // Do Operation: (x')*A
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto rows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto cols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
 
-    auto rows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto cols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+        auto vrow = RAJA::expt::VectorIndex<int, rvector_t>::static_all();
+        auto vcol = RAJA::expt::VectorIndex<int, cvector_t>::static_all();
 
-    auto vrow = RAJA::expt::VectorIndex<int, rvector_t>::static_all();
-    auto vcol = RAJA::expt::VectorIndex<int, cvector_t>::static_all();
-
-    data3_d(vrow) =  data2_d(vcol) * data1_d(rows, cols);
-
-  });
+        data3_d(vrow) = data2_d(vcol) * data1_d(rows, cols);
+      });
 
   tensor_copy_to_host<policy_t>(data3_vec, data3_ptr);
 
@@ -196,31 +213,35 @@ void ET_MatrixVectorImpl()
   //
   // Check results
   //
-  for(camp::idx_t j = 0;j < N; ++ j){
+  for (camp::idx_t j = 0; j < N; ++j)
+  {
 
 
     element_t expected(0);
-    for(camp::idx_t i = 0;i < N; ++ i){
-      expected += data2_h(i)*data1_h(i,j);
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      expected += data2_h(i) * data1_h(i, j);
     }
 
     ASSERT_SCALAR_EQ(expected, data3_h(j));
-//    printf("i=%d, data3=%lf, expected=%lf\n", (int)j, (double)data3_h(j), (double)expected);
+    //    printf("i=%d, data3=%lf, expected=%lf\n", (int)j, (double)data3_h(j),
+    //    (double)expected);
   }
 
 
-
-
   //
   // Loop over all possible sub-matrix sizes for (x')*A
   //
-  for(camp::idx_t n_size = 0;n_size <= N; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= N; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= N; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= N; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data3
       //
-      for(camp::idx_t j = 0;j < N; ++ j){
+      for (camp::idx_t j = 0; j < N; ++j)
+      {
         data3_h(j) = 0;
       }
 
@@ -230,16 +251,20 @@ void ET_MatrixVectorImpl()
       //
       // Do Operation (x')*A
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        // Load data using a View
-        auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
-        auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            // Load data using a View
+            auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
+            auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
 
-        auto vrow = RAJA::expt::VectorIndex<int, rvector_t>::range(0, m_size);
-        auto vcol = RAJA::expt::VectorIndex<int, cvector_t>::range(0, n_size);
+            auto vrow =
+                RAJA::expt::VectorIndex<int, rvector_t>::range(0, m_size);
+            auto vcol =
+                RAJA::expt::VectorIndex<int, cvector_t>::range(0, n_size);
 
-        data3_d(vrow) =  data2_d(vcol) * data1_d(rows, cols);
-      });
+            data3_d(vrow) = data2_d(vcol) * data1_d(rows, cols);
+          });
 
       tensor_copy_to_host<policy_t>(data3_vec, data3_ptr);
 
@@ -247,24 +272,25 @@ void ET_MatrixVectorImpl()
       //
       // Check results
       //
-      for(camp::idx_t j = 0;j < N; ++ j){
+      for (camp::idx_t j = 0; j < N; ++j)
+      {
 
         element_t expected(0);
 
-        for(camp::idx_t i = 0;i < n_size; ++ i){
-          expected += data2_h(i) * data1_h(i,j);
+        for (camp::idx_t i = 0; i < n_size; ++i)
+        {
+          expected += data2_h(i) * data1_h(i, j);
         }
 
-        if(j >= m_size || n_size == 0){
+        if (j >= m_size || n_size == 0)
+        {
           expected = 0;
         }
 
-//        printf("j=%d, expected=%e, data3=%e\n", (int)j, (double)expected, (double)data3_h(j));
+        //        printf("j=%d, expected=%e, data3=%e\n", (int)j,
+        //        (double)expected, (double)data3_h(j));
         ASSERT_SCALAR_EQ(expected, data3_h(j));
-
       }
-
-
     }
   }
 
@@ -275,11 +301,9 @@ void ET_MatrixVectorImpl()
   tensor_free<policy_t>(data1_ptr);
   tensor_free<policy_t>(data2_ptr);
   tensor_free<policy_t>(data3_ptr);
-
 }
 
 
-
 TYPED_TEST_P(TestTensorMatrix, ET_MatrixVector)
 {
   ET_MatrixVectorImpl<TypeParam>();
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Negate.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Negate.hpp
index 6336a2988d..a7ac9b4529 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Negate.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Negate.hpp
@@ -8,18 +8,19 @@
 #ifndef __TEST_TENSOR_MATRIX_ET_Negate_HPP__
 #define __TEST_TENSOR_MATRIX_ET_Negate_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void ET_NegateImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
-  static constexpr camp::idx_t N = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
+  static constexpr camp::idx_t N =
+      RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns) * 2;
 
   //
   // Allocate Row-Major Data
@@ -27,73 +28,76 @@ void ET_NegateImpl()
 
   // alloc input0
 
-  std::vector<element_t> input0_vec(N*N);
+  std::vector<element_t> input0_vec(N * N);
   RAJA::View<element_t, RAJA::Layout<2>> input0_h(input0_vec.data(), N, N);
 
-  element_t *input0_ptr = tensor_malloc<policy_t>(input0_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> input0_d(input0_ptr,  N, N);
+  element_t* input0_ptr = tensor_malloc<policy_t>(input0_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> input0_d(input0_ptr, N, N);
 
 
   // alloc input1 with StaticLayout
 
-  std::vector<element_t> input1_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> input1_h(input1_vec.data());
+  std::vector<element_t> input1_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> input1_h(
+      input1_vec.data());
 
-  element_t *input1_ptr = tensor_malloc<policy_t>(input1_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> input1_d(input1_ptr);
+  element_t* input1_ptr = tensor_malloc<policy_t>(input1_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> input1_d(
+      input1_ptr);
 
 
   // alloc output0
 
-  std::vector<element_t> output0_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> output0_h(output0_vec.data(),  N, N);
+  std::vector<element_t> output0_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> output0_h(output0_vec.data(), N, N);
 
-  element_t *output0_ptr = tensor_malloc<policy_t>(output0_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output0_d(output0_ptr,  N, N);
+  element_t* output0_ptr = tensor_malloc<policy_t>(output0_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output0_d(output0_ptr, N, N);
 
 
   // alloc output1
 
-  std::vector<element_t> output1_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> output1_h(output1_vec.data(),  N, N);
+  std::vector<element_t> output1_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> output1_h(output1_vec.data(), N, N);
 
-  element_t *output1_ptr = tensor_malloc<policy_t>(output1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output1_d(output1_ptr,  N, N);
+  element_t* output1_ptr = tensor_malloc<policy_t>(output1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output1_d(output1_ptr, N, N);
 
 
   // alloc output2
 
-  std::vector<element_t> output2_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> output2_h(output2_vec.data(),  N, N);
+  std::vector<element_t> output2_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> output2_h(output2_vec.data(), N, N);
 
-  element_t *output2_ptr = tensor_malloc<policy_t>(output2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output2_d(output2_ptr,  N, N);
+  element_t* output2_ptr = tensor_malloc<policy_t>(output2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output2_d(output2_ptr, N, N);
 
 
   // alloc output3
 
-  std::vector<element_t> output3_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> output3_h(output3_vec.data(),  N, N);
+  std::vector<element_t> output3_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> output3_h(output3_vec.data(), N, N);
 
-  element_t *output3_ptr = tensor_malloc<policy_t>(output3_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output3_d(output3_ptr,  N, N);
+  element_t* output3_ptr = tensor_malloc<policy_t>(output3_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output3_d(output3_ptr, N, N);
 
 
   // alloc output4
 
-  std::vector<element_t> output4_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> output4_h(output4_vec.data(),  N, N);
-
-  element_t *output4_ptr = tensor_malloc<policy_t>(output4_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output4_d(output4_ptr,  N, N);
+  std::vector<element_t> output4_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> output4_h(output4_vec.data(), N, N);
 
+  element_t* output4_ptr = tensor_malloc<policy_t>(output4_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output4_d(output4_ptr, N, N);
 
 
   // Fill input0
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      input0_h(i,j) = i*matrix_t::s_num_columns+j;
-      input1_h(i,j) = i*matrix_t::s_num_columns+j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      input0_h(i, j) = i * matrix_t::s_num_columns + j;
+      input1_h(i, j) = i * matrix_t::s_num_columns + j;
     }
   }
 
@@ -104,25 +108,29 @@ void ET_NegateImpl()
   //
   // Do Operation: negation
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto rows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto cols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
-
-    auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
-
-    auto SRrows = RAJA::expt::RowIndex<int, matrix_t>::template static_range<0,N>();
-    auto SRcols = RAJA::expt::ColIndex<int, matrix_t>::template static_range<0,N>();
-
-    output0_d(rows, cols) = -input0_d(rows, cols);
-
-    output1_d(rows, cols) = -input1_d(SArows, SRcols);  // mixed static_all and static_range
-    output2_d(rows, cols) = -input1_d(SArows, SAcols);  // static_all
-    output3_d(rows, cols) = -input1_d(SRrows, SRcols);  // static_range
-    output4_d(rows, cols) = -input1_d(rows, SRcols);    // mixed static_range and non-static
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto rows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto cols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+
+        auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+
+        auto SRrows =
+            RAJA::expt::RowIndex<int, matrix_t>::template static_range<0, N>();
+        auto SRcols =
+            RAJA::expt::ColIndex<int, matrix_t>::template static_range<0, N>();
+
+        output0_d(rows, cols) = -input0_d(rows, cols);
+
+        output1_d(rows, cols) =
+            -input1_d(SArows, SRcols);  // mixed static_all and static_range
+        output2_d(rows, cols) = -input1_d(SArows, SAcols);  // static_all
+        output3_d(rows, cols) = -input1_d(SRrows, SRcols);  // static_range
+        output4_d(rows, cols) =
+            -input1_d(rows, SRcols);  // mixed static_range and non-static
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_ptr);
   tensor_copy_to_host<policy_t>(output1_vec, output1_ptr);
@@ -134,18 +142,19 @@ void ET_NegateImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      ASSERT_SCALAR_EQ(output0_h(i,j), -input0_h(i,j));
-      ASSERT_SCALAR_EQ(output1_h(i,j), -input1_h(i,j));
-      ASSERT_SCALAR_EQ(output2_h(i,j), -input1_h(i,j));
-      ASSERT_SCALAR_EQ(output3_h(i,j), -input1_h(i,j));
-      ASSERT_SCALAR_EQ(output4_h(i,j), -input1_h(i,j));
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      ASSERT_SCALAR_EQ(output0_h(i, j), -input0_h(i, j));
+      ASSERT_SCALAR_EQ(output1_h(i, j), -input1_h(i, j));
+      ASSERT_SCALAR_EQ(output2_h(i, j), -input1_h(i, j));
+      ASSERT_SCALAR_EQ(output3_h(i, j), -input1_h(i, j));
+      ASSERT_SCALAR_EQ(output4_h(i, j), -input1_h(i, j));
     }
   }
 
 
-
   //
   // Free data
   //
@@ -156,15 +165,10 @@ void ET_NegateImpl()
   tensor_free<policy_t>(output2_ptr);
   tensor_free<policy_t>(output3_ptr);
   tensor_free<policy_t>(output4_ptr);
-
 }
 
 
-
-TYPED_TEST_P(TestTensorMatrix, ET_Negate)
-{
-  ET_NegateImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorMatrix, ET_Negate) { ET_NegateImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Subtract.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Subtract.hpp
index dd95c11904..5b3d146938 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Subtract.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Subtract.hpp
@@ -8,71 +8,77 @@
 #ifndef __TEST_TENSOR_MATRIX_ET_Subtract_HPP__
 #define __TEST_TENSOR_MATRIX_ET_Subtract_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void ET_SubtractImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
-  static constexpr camp::idx_t N = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
+  static constexpr camp::idx_t N =
+      RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns) * 2;
 
   //
   // Allocate Row-Major Data
   //
 
   // alloc data1
-  std::vector<element_t> data1_vec(N*N);
+  std::vector<element_t> data1_vec(N * N);
   RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), N, N);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr,  N, N);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, N, N);
 
 
   // alloc data2
-  std::vector<element_t> data2_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(),  N, N);
+  std::vector<element_t> data2_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), N, N);
 
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr,  N, N);
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr, N, N);
 
 
   // alloc data3 with StaticLayout
-  std::vector<element_t> data3_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data3_h(data3_vec.data());
+  std::vector<element_t> data3_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data3_h(
+      data3_vec.data());
 
-  element_t *data3_ptr = tensor_malloc<policy_t>(data3_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data3_d(data3_ptr);
+  element_t* data3_ptr = tensor_malloc<policy_t>(data3_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data3_d(
+      data3_ptr);
 
 
   // alloc data4 with StaticLayout
-  std::vector<element_t> data4_vec(N*N);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data4_h(data4_vec.data());
+  std::vector<element_t> data4_vec(N * N);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data4_h(
+      data4_vec.data());
 
-  element_t *data4_ptr = tensor_malloc<policy_t>(data4_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,N>> data4_d(data4_ptr);
+  element_t* data4_ptr = tensor_malloc<policy_t>(data4_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, N>> data4_d(
+      data4_ptr);
 
 
   // alloc data5
-  std::vector<element_t> data5_vec(N*N);
-  RAJA::View<element_t, RAJA::Layout<2>> data5_h(data5_vec.data(),  N, N);
-
-  element_t *data5_ptr = tensor_malloc<policy_t>(data5_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data5_d(data5_ptr,  N, N);
+  std::vector<element_t> data5_vec(N * N);
+  RAJA::View<element_t, RAJA::Layout<2>> data5_h(data5_vec.data(), N, N);
 
+  element_t* data5_ptr = tensor_malloc<policy_t>(data5_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data5_d(data5_ptr, N, N);
 
 
   // Fill data1, data2, data3, and data4
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      data1_h(i,j) = i*matrix_t::s_num_columns+j;
-      data2_h(i,j) = 1+i+j;
-      data3_h(i,j) = i*matrix_t::s_num_columns+j;
-      data4_h(i,j) = 1+i+j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      data1_h(i, j) = i * matrix_t::s_num_columns + j;
+      data2_h(i, j) = 1 + i + j;
+      data3_h(i, j) = i * matrix_t::s_num_columns + j;
+      data4_h(i, j) = 1 + i + j;
     }
   }
 
@@ -85,26 +91,29 @@ void ET_SubtractImpl()
   //
   // Do Operation: Full sum of data1, data2, data3, and data4
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto rows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto cols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto rows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto cols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
 
-    auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+        auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
 
-    auto SRrows = RAJA::expt::RowIndex<int, matrix_t>::template static_range<0,N>();
-    auto SRcols = RAJA::expt::ColIndex<int, matrix_t>::template static_range<0,N>();
+        auto SRrows =
+            RAJA::expt::RowIndex<int, matrix_t>::template static_range<0, N>();
+        auto SRcols =
+            RAJA::expt::ColIndex<int, matrix_t>::template static_range<0, N>();
 
-    // Access types:
-    // data1_d - Layout with all() and all().
-    // data2_d - Layout with all() and static_range(), which should default to normal Layout access.
-    // data3_d - StaticLayout with static_all() and static_range().
-    // data4_d - StaticLayout with static_all() and all().
-
-    data5_d(cols, rows) = data1_d(rows, cols) - data2_d(cols, SRrows) + data3_d(SArows, SRcols) - data4_d(SAcols, rows);
+        // Access types:
+        // data1_d - Layout with all() and all().
+        // data2_d - Layout with all() and static_range(), which should default
+        // to normal Layout access. data3_d - StaticLayout with static_all() and
+        // static_range(). data4_d - StaticLayout with static_all() and all().
 
-  });
+        data5_d(cols, rows) = data1_d(rows, cols) - data2_d(cols, SRrows) +
+                              data3_d(SArows, SRcols) - data4_d(SAcols, rows);
+      });
 
   tensor_copy_to_host<policy_t>(data5_vec, data5_ptr);
 
@@ -112,27 +121,34 @@ void ET_SubtractImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      ASSERT_SCALAR_EQ(data5_h(j,i), data1_h(i,j)-data2_h(j,i)+data3_h(i,j)-data4_h(j,i));
-//      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      ASSERT_SCALAR_EQ(data5_h(j, i), data1_h(i, j) - data2_h(j, i) +
+                                          data3_h(i, j) - data4_h(j, i));
+      //      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+      //      data2(i,j));
     }
   }
 
 
-
   //
   // Loop over all possible sub-matrix sizes using the load_*_nm routines
   //
-  for(camp::idx_t n_size = 0;n_size <= N; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= N; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= N; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= N; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data5
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
-        for(camp::idx_t j = 0;j < N; ++ j){
-          data5_h(j,i) = -1;
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
+        for (camp::idx_t j = 0; j < N; ++j)
+        {
+          data5_h(j, i) = -1;
         }
       }
       tensor_copy_to_device<policy_t>(data5_ptr, data5_vec);
@@ -141,16 +157,19 @@ void ET_SubtractImpl()
       //
       // Do Operation: Perform partial sum
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        // Load data using a View
-        auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
-        auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            // Load data using a View
+            auto rows = RAJA::expt::RowIndex<int, matrix_t>::range(0, n_size);
+            auto cols = RAJA::expt::ColIndex<int, matrix_t>::range(0, m_size);
 
-        // Access types:
-        // Layout with range() and range() because loop iterate cannot be determined statically.
+            // Access types:
+            // Layout with range() and range() because loop iterate cannot be
+            // determined statically.
 
-        data5_d(cols, rows) = data1_d(rows, cols) - data2_d(cols, rows);
-      });
+            data5_d(cols, rows) = data1_d(rows, cols) - data2_d(cols, rows);
+          });
 
       tensor_copy_to_host<policy_t>(data5_vec, data5_ptr);
 
@@ -158,19 +177,22 @@ void ET_SubtractImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < N; ++ i){
-        for(camp::idx_t j = 0;j < N; ++ j){
-//          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
-          if(i < n_size && j < m_size){
-            ASSERT_SCALAR_EQ(data5_h(j,i), data1_h(i,j)-data2_h(j,i));
+      for (camp::idx_t i = 0; i < N; ++i)
+      {
+        for (camp::idx_t j = 0; j < N; ++j)
+        {
+          //          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+          //          data2(i,j));
+          if (i < n_size && j < m_size)
+          {
+            ASSERT_SCALAR_EQ(data5_h(j, i), data1_h(i, j) - data2_h(j, i));
           }
-          else{
-            ASSERT_SCALAR_EQ(element_t(-1), data5_h(j,i));
+          else
+          {
+            ASSERT_SCALAR_EQ(element_t(-1), data5_h(j, i));
           }
         }
       }
-
-
     }
   }
 
@@ -186,11 +208,7 @@ void ET_SubtractImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorMatrix, ET_Subtract)
-{
-  ET_SubtractImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorMatrix, ET_Subtract) { ET_SubtractImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Transpose.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Transpose.hpp
index 9f40887dd1..18a3d44b5f 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Transpose.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-ET_Transpose.hpp
@@ -8,20 +8,22 @@
 #ifndef __TEST_TENSOR_MATRIX_ET_Transpose_HPP__
 #define __TEST_TENSOR_MATRIX_ET_Transpose_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void ET_TransposeImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
   using transpose_t = typename matrix_t::transpose_type;
 
-//  static constexpr camp::idx_t N = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
-//  static constexpr camp::idx_t M = RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
+  //  static constexpr camp::idx_t N =
+  //  RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
+  //  static constexpr camp::idx_t M =
+  //  RAJA::max<camp::idx_t>(matrix_t::s_num_rows, matrix_t::s_num_columns)*2;
 
   static constexpr camp::idx_t N = matrix_t::s_num_rows;
   static constexpr camp::idx_t M = matrix_t::s_num_columns;
@@ -32,74 +34,76 @@ void ET_TransposeImpl()
 
   // alloc input0
 
-  std::vector<element_t> input0_vec(N*M);
+  std::vector<element_t> input0_vec(N * M);
   RAJA::View<element_t, RAJA::Layout<2>> input0_h(input0_vec.data(), N, M);
 
-  element_t *input0_ptr = tensor_malloc<policy_t>(input0_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> input0_d(input0_ptr,  N, M);
+  element_t* input0_ptr = tensor_malloc<policy_t>(input0_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> input0_d(input0_ptr, N, M);
 
 
   // alloc input1 with StaticLayout
 
-  std::vector<element_t> input1_vec(N*M);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,M>> input1_h(input1_vec.data());
-
-  element_t *input1_ptr = tensor_malloc<policy_t>(input1_vec);
-  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ,N,M>> input1_d(input1_ptr);
+  std::vector<element_t> input1_vec(N * M);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, M>> input1_h(
+      input1_vec.data());
 
+  element_t* input1_ptr = tensor_malloc<policy_t>(input1_vec);
+  RAJA::View<element_t, RAJA::StaticLayout<RAJA::PERM_IJ, N, M>> input1_d(
+      input1_ptr);
 
 
   // alloc output0
 
-  std::vector<element_t> output0_vec(N*M);
-  RAJA::View<element_t, RAJA::Layout<2>> output0_h(output0_vec.data(),  M, N);
+  std::vector<element_t> output0_vec(N * M);
+  RAJA::View<element_t, RAJA::Layout<2>> output0_h(output0_vec.data(), M, N);
 
-  element_t *output0_ptr = tensor_malloc<policy_t>(output0_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output0_d(output0_ptr,  M, N);
+  element_t* output0_ptr = tensor_malloc<policy_t>(output0_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output0_d(output0_ptr, M, N);
 
 
   // alloc output1
 
-  std::vector<element_t> output1_vec(N*M);
-  RAJA::View<element_t, RAJA::Layout<2>> output1_h(output1_vec.data(),  M, N);
+  std::vector<element_t> output1_vec(N * M);
+  RAJA::View<element_t, RAJA::Layout<2>> output1_h(output1_vec.data(), M, N);
 
-  element_t *output1_ptr = tensor_malloc<policy_t>(output1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output1_d(output1_ptr,  M, N);
+  element_t* output1_ptr = tensor_malloc<policy_t>(output1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output1_d(output1_ptr, M, N);
 
 
   // alloc output2
 
-  std::vector<element_t> output2_vec(N*M);
-  RAJA::View<element_t, RAJA::Layout<2>> output2_h(output2_vec.data(),  M, N);
+  std::vector<element_t> output2_vec(N * M);
+  RAJA::View<element_t, RAJA::Layout<2>> output2_h(output2_vec.data(), M, N);
 
-  element_t *output2_ptr = tensor_malloc<policy_t>(output2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output2_d(output2_ptr,  M, N);
+  element_t* output2_ptr = tensor_malloc<policy_t>(output2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output2_d(output2_ptr, M, N);
 
 
   // alloc output3
 
-  std::vector<element_t> output3_vec(N*M);
-  RAJA::View<element_t, RAJA::Layout<2>> output3_h(output3_vec.data(),  M, N);
+  std::vector<element_t> output3_vec(N * M);
+  RAJA::View<element_t, RAJA::Layout<2>> output3_h(output3_vec.data(), M, N);
 
-  element_t *output3_ptr = tensor_malloc<policy_t>(output3_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output3_d(output3_ptr,  M, N);
+  element_t* output3_ptr = tensor_malloc<policy_t>(output3_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output3_d(output3_ptr, M, N);
 
 
   // alloc output4
 
-  std::vector<element_t> output4_vec(N*M);
-  RAJA::View<element_t, RAJA::Layout<2>> output4_h(output4_vec.data(),  M, N);
-
-  element_t *output4_ptr = tensor_malloc<policy_t>(output4_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output4_d(output4_ptr,  M, N);
+  std::vector<element_t> output4_vec(N * M);
+  RAJA::View<element_t, RAJA::Layout<2>> output4_h(output4_vec.data(), M, N);
 
+  element_t* output4_ptr = tensor_malloc<policy_t>(output4_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output4_d(output4_ptr, M, N);
 
 
   // Fill input0 and input1
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < M; ++ j){
-      input0_h(i,j) = i*matrix_t::s_num_columns+j;
-      input1_h(i,j) = i*matrix_t::s_num_columns+j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < M; ++j)
+    {
+      input0_h(i, j) = i * matrix_t::s_num_columns + j;
+      input1_h(i, j) = i * matrix_t::s_num_columns + j;
     }
   }
 
@@ -110,28 +114,36 @@ void ET_TransposeImpl()
   //
   // Do Operation: transpose
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    auto rows = RAJA::expt::RowIndex<int, matrix_t>::all();
-    auto cols = RAJA::expt::ColIndex<int, matrix_t>::all();
-
-    auto rows_tr = RAJA::expt::RowIndex<int, transpose_t>::all();
-    auto cols_tr = RAJA::expt::ColIndex<int, transpose_t>::all();
-
-    auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
-    auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
-
-    auto SRrows = RAJA::expt::RowIndex<int, matrix_t>::template static_range<0,N>();
-    auto SRcols = RAJA::expt::ColIndex<int, matrix_t>::template static_range<0,M>();
-
-    output0_d(rows_tr, cols_tr) = input0_d(rows, cols).transpose();
-
-    output1_d(rows_tr, cols_tr) = input1_d(SArows, SRcols).transpose();  // mixed static_all and static_range
-    output2_d(rows_tr, cols_tr) = input1_d(SArows, SAcols).transpose();  // static_all
-    output3_d(rows_tr, cols_tr) = input1_d(SRrows, SRcols).transpose();  // static_range
-    output4_d(rows_tr, cols_tr) = input1_d(rows, SRcols).transpose();    // mixed static_range and non-static
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        auto rows = RAJA::expt::RowIndex<int, matrix_t>::all();
+        auto cols = RAJA::expt::ColIndex<int, matrix_t>::all();
+
+        auto rows_tr = RAJA::expt::RowIndex<int, transpose_t>::all();
+        auto cols_tr = RAJA::expt::ColIndex<int, transpose_t>::all();
+
+        auto SArows = RAJA::expt::RowIndex<int, matrix_t>::static_all();
+        auto SAcols = RAJA::expt::ColIndex<int, matrix_t>::static_all();
+
+        auto SRrows =
+            RAJA::expt::RowIndex<int, matrix_t>::template static_range<0, N>();
+        auto SRcols =
+            RAJA::expt::ColIndex<int, matrix_t>::template static_range<0, M>();
+
+        output0_d(rows_tr, cols_tr) = input0_d(rows, cols).transpose();
+
+        output1_d(rows_tr, cols_tr) =
+            input1_d(SArows, SRcols)
+                .transpose();  // mixed static_all and static_range
+        output2_d(rows_tr, cols_tr) =
+            input1_d(SArows, SAcols).transpose();  // static_all
+        output3_d(rows_tr, cols_tr) =
+            input1_d(SRrows, SRcols).transpose();  // static_range
+        output4_d(rows_tr, cols_tr) =
+            input1_d(rows, SRcols)
+                .transpose();  // mixed static_range and non-static
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_ptr);
   tensor_copy_to_host<policy_t>(output1_vec, output1_ptr);
@@ -140,31 +152,30 @@ void ET_TransposeImpl()
   tensor_copy_to_host<policy_t>(output4_vec, output4_ptr);
 
 
-
-//  for(camp::idx_t i = 0;i < M; ++ i){
-//    for(camp::idx_t j = 0;j < N; ++ j){
-//      printf("%3d ", (int)output0_h(i,j));
-//    }
-//    printf("\n");
-//  }
-
+  //  for(camp::idx_t i = 0;i < M; ++ i){
+  //    for(camp::idx_t j = 0;j < N; ++ j){
+  //      printf("%3d ", (int)output0_h(i,j));
+  //    }
+  //    printf("\n");
+  //  }
 
 
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < M; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      ASSERT_SCALAR_EQ(output0_h(i,j), input0_h(j,i));
-      ASSERT_SCALAR_EQ(output1_h(i,j), input1_h(j,i));
-      ASSERT_SCALAR_EQ(output2_h(i,j), input1_h(j,i));
-      ASSERT_SCALAR_EQ(output3_h(i,j), input1_h(j,i));
-      ASSERT_SCALAR_EQ(output4_h(i,j), input1_h(j,i));
+  for (camp::idx_t i = 0; i < M; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      ASSERT_SCALAR_EQ(output0_h(i, j), input0_h(j, i));
+      ASSERT_SCALAR_EQ(output1_h(i, j), input1_h(j, i));
+      ASSERT_SCALAR_EQ(output2_h(i, j), input1_h(j, i));
+      ASSERT_SCALAR_EQ(output3_h(i, j), input1_h(j, i));
+      ASSERT_SCALAR_EQ(output4_h(i, j), input1_h(j, i));
     }
   }
 
 
-
   //
   // Free data
   //
@@ -175,15 +186,10 @@ void ET_TransposeImpl()
   tensor_free<policy_t>(output2_ptr);
   tensor_free<policy_t>(output3_ptr);
   tensor_free<policy_t>(output4_ptr);
-
 }
 
 
-
-TYPED_TEST_P(TestTensorMatrix, ET_Transpose)
-{
-  ET_TransposeImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorMatrix, ET_Transpose) { ET_TransposeImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-Load_ColMajor.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-Load_ColMajor.hpp
index fff811c48f..bbf131075b 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-Load_ColMajor.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-Load_ColMajor.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_REGISTER_Load_ColMajor_HPP__
 #define __TEST_TENSOR_REGISTER_Load_ColMajor_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void Load_ColMajorImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
@@ -25,26 +25,34 @@ void Load_ColMajorImpl()
 
   // alloc data1
 
-  std::vector<element_t> data1_vec(4*matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), 2*matrix_t::s_num_columns, 2*matrix_t::s_num_rows);
+  std::vector<element_t> data1_vec(4 * matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_h(
+      data1_vec.data(), 2 * matrix_t::s_num_columns, 2 * matrix_t::s_num_rows);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, 2*matrix_t::s_num_columns, 2*matrix_t::s_num_rows);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(
+      data1_ptr, 2 * matrix_t::s_num_columns, 2 * matrix_t::s_num_rows);
 
 
   // alloc data2
 
-  std::vector<element_t> data2_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
+  std::vector<element_t> data2_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(
+      data2_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
 
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_d(
+      data2_ptr, matrix_t::s_num_columns, matrix_t::s_num_rows);
 
 
   // Fill data
-  for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-      data1_h(j,i) = 2*i*matrix_t::s_num_columns+j;
+  for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+    {
+      data1_h(j, i) = 2 * i * matrix_t::s_num_columns + j;
     }
   }
 
@@ -54,25 +62,30 @@ void Load_ColMajorImpl()
   //
   // Do operation
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    matrix_t m;
-
-    if(matrix_t::layout_type::is_column_major()){
-      m.load_packed(data1_ptr, 1, 2*matrix_t::s_num_rows);
-    }
-    else{
-      m.load_strided(data1_ptr, 1, 2*matrix_t::s_num_rows);
-    }
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        matrix_t m;
 
-    // write out to a second view so we can check it on the host
-    // on GPU's we'll write way too much, but it should stil be correct
-    for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-      for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-        data2_d(j,i) = m.get(i,j);
-      }
-    }
+        if (matrix_t::layout_type::is_column_major())
+        {
+          m.load_packed(data1_ptr, 1, 2 * matrix_t::s_num_rows);
+        }
+        else
+        {
+          m.load_strided(data1_ptr, 1, 2 * matrix_t::s_num_rows);
+        }
 
-  });
+        // write out to a second view so we can check it on the host
+        // on GPUs we'll write way too much, but it should still be correct
+        for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+        {
+          for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+          {
+            data2_d(j, i) = m.get(i, j);
+          }
+        }
+      });
 
   tensor_copy_to_host<policy_t>(data2_vec, data2_ptr);
 
@@ -80,27 +93,33 @@ void Load_ColMajorImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-      ASSERT_SCALAR_EQ(data1_h(j,i), data2_h(j,i));
-//      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(j,i), data2(j,i));
+  for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+    {
+      ASSERT_SCALAR_EQ(data1_h(j, i), data2_h(j, i));
+      //      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(j,i),
+      //      data2(j,i));
     }
   }
 
 
-
   //
   // Loop over all possible sub-matrix sizes using the load_*_nm routines
   //
-  for(camp::idx_t n_size = 0;n_size <= matrix_t::s_num_rows; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= matrix_t::s_num_columns; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= matrix_t::s_num_rows; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= matrix_t::s_num_columns; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data2
       //
-      for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-          data2_h(j,i) = -1;
+      for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+        {
+          data2_h(j, i) = -1;
         }
       }
 
@@ -110,24 +129,31 @@ void Load_ColMajorImpl()
       //
       // Do Operation: Partial load
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        matrix_t m;
-        if(matrix_t::layout_type::is_column_major()){
-          m.load_packed_nm(data1_ptr, 1, 2*matrix_t::s_num_rows, n_size, m_size);
-        }
-        else{
-          m.load_strided_nm(data1_ptr, 1, 2*matrix_t::s_num_rows, n_size, m_size);
-        }
-
-        // write out to a second view so we can check it on the host
-        // on GPU's we'll write way too much, but it should stil be correct
-        for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-          for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-            data2_d(j,i) = m.get(i,j);
-          }
-        }
-
-      });
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            matrix_t m;
+            if (matrix_t::layout_type::is_column_major())
+            {
+              m.load_packed_nm(data1_ptr, 1, 2 * matrix_t::s_num_rows, n_size,
+                               m_size);
+            }
+            else
+            {
+              m.load_strided_nm(data1_ptr, 1, 2 * matrix_t::s_num_rows, n_size,
+                                m_size);
+            }
+
+            // write out to a second view so we can check it on the host
+            // on GPUs we'll write way too much, but it should still be correct
+            for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+            {
+              for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+              {
+                data2_d(j, i) = m.get(i, j);
+              }
+            }
+          });
 
       tensor_copy_to_host<policy_t>(data2_vec, data2_ptr);
 
@@ -135,19 +161,22 @@ void Load_ColMajorImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-//          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
-          if(i < n_size && j < m_size){
-            ASSERT_SCALAR_EQ(data1_h(j,i), data2_h(j,i));
+      for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+        {
+          //          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+          //          data2(i,j));
+          if (i < n_size && j < m_size)
+          {
+            ASSERT_SCALAR_EQ(data1_h(j, i), data2_h(j, i));
           }
-          else{
-            ASSERT_SCALAR_EQ(element_t(0), data2_h(j,i));
+          else
+          {
+            ASSERT_SCALAR_EQ(element_t(0), data2_h(j, i));
           }
         }
       }
-
-
     }
   }
 
@@ -160,7 +189,6 @@ void Load_ColMajorImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorMatrix, Load_ColMajor)
 {
   Load_ColMajorImpl<TypeParam>();
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-Load_RowMajor.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-Load_RowMajor.hpp
index 8cae00baec..84eee26474 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-Load_RowMajor.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-Load_RowMajor.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_REGISTER_Load_RowMajor_HPP__
 #define __TEST_TENSOR_REGISTER_Load_RowMajor_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void Load_RowMajorImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
@@ -25,27 +25,34 @@ void Load_RowMajorImpl()
 
   // alloc data1
 
-  std::vector<element_t> data1_vec(4*matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), 2*matrix_t::s_num_rows, 2*matrix_t::s_num_columns);
+  std::vector<element_t> data1_vec(4 * matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_h(
+      data1_vec.data(), 2 * matrix_t::s_num_rows, 2 * matrix_t::s_num_columns);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, 2*matrix_t::s_num_rows, 2*matrix_t::s_num_columns);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(
+      data1_ptr, 2 * matrix_t::s_num_rows, 2 * matrix_t::s_num_columns);
 
 
   // alloc data2
 
-  std::vector<element_t> data2_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
-
-  element_t *data2_ptr = tensor_malloc<policy_t>(data2_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_d(data2_ptr, matrix_t::s_num_rows, matrix_t::s_num_columns);
+  std::vector<element_t> data2_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(
+      data2_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
 
+  element_t* data2_ptr = tensor_malloc<policy_t>(data2_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_d(
+      data2_ptr, matrix_t::s_num_rows, matrix_t::s_num_columns);
 
 
   // Fill data
-  for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-      data1_h(i,j) = 2*i*matrix_t::s_num_columns+j;
+  for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+    {
+      data1_h(i, j) = 2 * i * matrix_t::s_num_columns + j;
     }
   }
 
@@ -55,24 +62,29 @@ void Load_RowMajorImpl()
   //
   // Do Operation: Full load
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    matrix_t m;
-    if(matrix_t::layout_type::is_row_major()){
-      m.load_packed(data1_ptr, 2*matrix_t::s_num_columns, 1);
-    }
-    else{
-      m.load_strided(data1_ptr, 2*matrix_t::s_num_columns, 1);
-    }
-
-    // write out to a second view so we can check it on the host
-    // on GPU's we'll write way too much, but it should stil be correct
-    for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-      for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-        data2_d(i,j) = m.get(i,j);
-      }
-    }
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        matrix_t m;
+        if (matrix_t::layout_type::is_row_major())
+        {
+          m.load_packed(data1_ptr, 2 * matrix_t::s_num_columns, 1);
+        }
+        else
+        {
+          m.load_strided(data1_ptr, 2 * matrix_t::s_num_columns, 1);
+        }
 
-  });
+        // write out to a second view so we can check it on the host
+        // on GPUs we'll write way too much, but it should still be correct
+        for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+        {
+          for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+          {
+            data2_d(i, j) = m.get(i, j);
+          }
+        }
+      });
 
   tensor_copy_to_host<policy_t>(data2_vec, data2_ptr);
 
@@ -80,27 +92,33 @@ void Load_RowMajorImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-      ASSERT_SCALAR_EQ(data1_h(i,j), data2_h(i,j));
-//      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
+  for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+    {
+      ASSERT_SCALAR_EQ(data1_h(i, j), data2_h(i, j));
+      //      printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+      //      data2(i,j));
     }
   }
 
 
-
   //
   // Loop over all possible sub-matrix sizes using the load_*_nm routines
   //
-  for(camp::idx_t n_size = 0;n_size <= matrix_t::s_num_rows; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= matrix_t::s_num_columns; ++ m_size){
-//      printf("Running %d x %d\n", (int)n_size, (int)m_size);
+  for (camp::idx_t n_size = 0; n_size <= matrix_t::s_num_rows; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= matrix_t::s_num_columns; ++m_size)
+    {
+      //      printf("Running %d x %d\n", (int)n_size, (int)m_size);
       //
       // Clear data2
       //
-      for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-          data2_h(i,j) = -1;
+      for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+        {
+          data2_h(i, j) = -1;
         }
       }
       tensor_copy_to_device<policy_t>(data2_ptr, data2_vec);
@@ -109,24 +127,31 @@ void Load_RowMajorImpl()
       //
       // Do Operation: Partial load
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        matrix_t m;
-        if(matrix_t::layout_type::is_row_major()){
-          m.load_packed_nm(data1_ptr, 2*matrix_t::s_num_columns, 1, n_size, m_size);
-        }
-        else{
-          m.load_strided_nm(data1_ptr, 2*matrix_t::s_num_columns, 1, n_size, m_size);
-        }
-
-        // write out to a second view so we can check it on the host
-        // on GPU's we'll write way too much, but it should stil be correct
-        for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-          for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-            data2_d(i,j) = m.get(i,j);
-          }
-        }
-
-      });
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            matrix_t m;
+            if (matrix_t::layout_type::is_row_major())
+            {
+              m.load_packed_nm(data1_ptr, 2 * matrix_t::s_num_columns, 1,
+                               n_size, m_size);
+            }
+            else
+            {
+              m.load_strided_nm(data1_ptr, 2 * matrix_t::s_num_columns, 1,
+                                n_size, m_size);
+            }
+
+            // write out to a second view so we can check it on the host
+            // on GPUs we'll write way too much, but it should still be correct
+            for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+            {
+              for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+              {
+                data2_d(i, j) = m.get(i, j);
+              }
+            }
+          });
 
       tensor_copy_to_host<policy_t>(data2_vec, data2_ptr);
 
@@ -134,19 +159,22 @@ void Load_RowMajorImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-//          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j), data2(i,j));
-          if(i < n_size && j < m_size){
-            ASSERT_SCALAR_EQ(data1_h(i,j), data2_h(i,j));
+      for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+        {
+          //          printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1(i,j),
+          //          data2(i,j));
+          if (i < n_size && j < m_size)
+          {
+            ASSERT_SCALAR_EQ(data1_h(i, j), data2_h(i, j));
           }
-          else{
-            ASSERT_SCALAR_EQ(element_t(0), data2_h(i,j));
+          else
+          {
+            ASSERT_SCALAR_EQ(element_t(0), data2_h(i, j));
           }
         }
       }
-
-
     }
   }
 
@@ -159,7 +187,6 @@ void Load_RowMajorImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorMatrix, Load_RowMajor)
 {
   Load_RowMajorImpl<TypeParam>();
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-Store_ColMajor.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-Store_ColMajor.hpp
index 0961e3722d..b107b919e2 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-Store_ColMajor.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-Store_ColMajor.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_MATRIX_Store_ColMajor_HPP__
 #define __TEST_TENSOR_MATRIX_Store_ColMajor_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void Store_ColMajorImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
 
@@ -25,34 +25,43 @@ void Store_ColMajorImpl()
 
   // alloc data1 - matrix data will be generated on device, stored into data1
 
-  std::vector<element_t> data1_vec(4*matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), 2*matrix_t::s_num_columns, 2*matrix_t::s_num_rows);
+  std::vector<element_t> data1_vec(4 * matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_h(
+      data1_vec.data(), 2 * matrix_t::s_num_columns, 2 * matrix_t::s_num_rows);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, 2*matrix_t::s_num_rows, 2*matrix_t::s_num_columns);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(
+      data1_ptr, 2 * matrix_t::s_num_rows, 2 * matrix_t::s_num_columns);
 
 
   // alloc data2 - reference data to compare with data1 on host
 
-  std::vector<element_t> data2_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
+  std::vector<element_t> data2_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(
+      data2_vec.data(), matrix_t::s_num_columns, matrix_t::s_num_rows);
 
 
   //
   // Fill reference data
   //
-  for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-      data2_h(j,i) = 2*i*matrix_t::s_num_columns+j;
+  for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+    {
+      data2_h(j, i) = 2 * i * matrix_t::s_num_columns + j;
     }
   }
 
   //
   // Clear data1
   //
-  for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-      data1_h(j,i) = element_t(-2);
+  for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+    {
+      data1_h(j, i) = element_t(-2);
     }
   }
   tensor_copy_to_device<policy_t>(data1_ptr, data1_vec);
@@ -61,25 +70,30 @@ void Store_ColMajorImpl()
   //
   // Do Operation: Full store
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill out matrix
-    matrix_t m(-1.0);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill out matrix
+        matrix_t m(-1.0);
 
-    for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-      for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-        m.set(2*i*matrix_t::s_num_columns+j, i, j);
-      }
-    }
+        for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+        {
+          for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+          {
+            m.set(2 * i * matrix_t::s_num_columns + j, i, j);
+          }
+        }
 
-    // Store matrix to memory
-    if(matrix_t::layout_type::is_column_major()){
-      m.store_packed(data1_ptr, 1, 2*matrix_t::s_num_rows);
-    }
-    else{
-      m.store_strided(data1_ptr, 1, 2*matrix_t::s_num_rows);
-    }
-  });
+        // Store matrix to memory
+        if (matrix_t::layout_type::is_column_major())
+        {
+          m.store_packed(data1_ptr, 1, 2 * matrix_t::s_num_rows);
+        }
+        else
+        {
+          m.store_strided(data1_ptr, 1, 2 * matrix_t::s_num_rows);
+        }
+      });
 
   tensor_copy_to_host<policy_t>(data1_vec, data1_ptr);
 
@@ -87,33 +101,41 @@ void Store_ColMajorImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-      if(i < matrix_t::s_num_rows && j < matrix_t::s_num_columns){
-//        printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1_h(i,j), data2_h(i,j));
-        ASSERT_SCALAR_EQ(data1_h(j,i), data2_h(j,i));
+  for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+    {
+      if (i < matrix_t::s_num_rows && j < matrix_t::s_num_columns)
+      {
+        //        printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1_h(i,j),
+        //        data2_h(i,j));
+        ASSERT_SCALAR_EQ(data1_h(j, i), data2_h(j, i));
       }
-      else{
-//        printf("%d,%d:  %lf, -2\n", (int)i, (int)j, data1_h(i,j));
-        ASSERT_SCALAR_EQ(data1_h(j,i), element_t(-2));
+      else
+      {
+        //        printf("%d,%d:  %lf, -2\n", (int)i, (int)j, data1_h(i,j));
+        ASSERT_SCALAR_EQ(data1_h(j, i), element_t(-2));
       }
     }
   }
 
 
-
   //
   // Loop over all possible sub-matrix sizes using the load_*_nm routines
   //
-  for(camp::idx_t n_size = 0;n_size <= matrix_t::s_num_rows; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= matrix_t::s_num_columns; ++ m_size){
+  for (camp::idx_t n_size = 0; n_size <= matrix_t::s_num_rows; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= matrix_t::s_num_columns; ++m_size)
+    {
 
       //
       // Clear data1
       //
-      for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-          data1_h(j,i) = element_t(-2);
+      for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+        {
+          data1_h(j, i) = element_t(-2);
         }
       }
       tensor_copy_to_device<policy_t>(data1_ptr, data1_vec);
@@ -122,25 +144,32 @@ void Store_ColMajorImpl()
       //
       // Do Operation: Partial Store
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        // fill out matrix
-        matrix_t m(-1.0);
-
-        for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-          for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-            m.set(2*i*matrix_t::s_num_columns+j, i, j);
-          }
-        }
-
-        // Store matrix to memory
-        if(matrix_t::layout_type::is_column_major()){
-          m.store_packed_nm(data1_ptr, 1, 2*matrix_t::s_num_rows, n_size, m_size);
-        }
-        else{
-          m.store_strided_nm(data1_ptr, 1, 2*matrix_t::s_num_rows, n_size, m_size);
-        }
-
-      });
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            // fill out matrix
+            matrix_t m(-1.0);
+
+            for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+            {
+              for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+              {
+                m.set(2 * i * matrix_t::s_num_columns + j, i, j);
+              }
+            }
+
+            // Store matrix to memory
+            if (matrix_t::layout_type::is_column_major())
+            {
+              m.store_packed_nm(data1_ptr, 1, 2 * matrix_t::s_num_rows, n_size,
+                                m_size);
+            }
+            else
+            {
+              m.store_strided_nm(data1_ptr, 1, 2 * matrix_t::s_num_rows, n_size,
+                                 m_size);
+            }
+          });
 
 
       tensor_copy_to_host<policy_t>(data1_vec, data1_ptr);
@@ -149,20 +178,24 @@ void Store_ColMajorImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-          if(i < n_size && j < m_size){
-//            printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1_h(i,j), data2_h(i,j));
-            ASSERT_SCALAR_EQ(data1_h(j,i), data2_h(j,i));
+      for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+        {
+          if (i < n_size && j < m_size)
+          {
+            //            printf("%d,%d:  %lf, %lf\n", (int)i, (int)j,
+            //            data1_h(i,j), data2_h(i,j));
+            ASSERT_SCALAR_EQ(data1_h(j, i), data2_h(j, i));
           }
-          else{
-//            printf("%d,%d:  %lf, -2\n", (int)i, (int)j, data1_h(i,j));
-            ASSERT_SCALAR_EQ(data1_h(j,i), element_t(-2));
+          else
+          {
+            //            printf("%d,%d:  %lf, -2\n", (int)i, (int)j,
+            //            data1_h(i,j));
+            ASSERT_SCALAR_EQ(data1_h(j, i), element_t(-2));
           }
         }
       }
-
-
     }
   }
 
@@ -174,7 +207,6 @@ void Store_ColMajorImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorMatrix, Store_ColMajor)
 {
   Store_ColMajorImpl<TypeParam>();
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-Store_RowMajor.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-Store_RowMajor.hpp
index 94172b4342..ae3d9b5fba 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-Store_RowMajor.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-Store_RowMajor.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_MATRIX_Store_RowMajor_HPP__
 #define __TEST_TENSOR_MATRIX_Store_RowMajor_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void Store_RowMajorImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
   //
@@ -24,34 +24,43 @@ void Store_RowMajorImpl()
 
   // alloc data1 - matrix data will be generated on device, stored into data1
 
-  std::vector<element_t> data1_vec(4*matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_h(data1_vec.data(), 2*matrix_t::s_num_rows, 2*matrix_t::s_num_columns);
+  std::vector<element_t> data1_vec(4 * matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_h(
+      data1_vec.data(), 2 * matrix_t::s_num_rows, 2 * matrix_t::s_num_columns);
 
-  element_t *data1_ptr = tensor_malloc<policy_t>(data1_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> data1_d(data1_ptr, 2*matrix_t::s_num_rows, 2*matrix_t::s_num_columns);
+  element_t* data1_ptr = tensor_malloc<policy_t>(data1_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> data1_d(
+      data1_ptr, 2 * matrix_t::s_num_rows, 2 * matrix_t::s_num_columns);
 
 
   // alloc data2 - reference data to compare with data1 on host
 
-  std::vector<element_t> data2_vec(matrix_t::s_num_rows*matrix_t::s_num_columns);
-  RAJA::View<element_t, RAJA::Layout<2>> data2_h(data2_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
+  std::vector<element_t> data2_vec(matrix_t::s_num_rows *
+                                   matrix_t::s_num_columns);
+  RAJA::View<element_t, RAJA::Layout<2>> data2_h(
+      data2_vec.data(), matrix_t::s_num_rows, matrix_t::s_num_columns);
 
 
   //
   // Fill reference data
   //
-  for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-      data2_h(i,j) = 2*i*matrix_t::s_num_columns+j;
+  for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+    {
+      data2_h(i, j) = 2 * i * matrix_t::s_num_columns + j;
     }
   }
 
   //
   // Clear data1
   //
-  for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-      data1_h(i,j) = element_t(-2);
+  for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+    {
+      data1_h(i, j) = element_t(-2);
     }
   }
   tensor_copy_to_device<policy_t>(data1_ptr, data1_vec);
@@ -60,25 +69,30 @@ void Store_RowMajorImpl()
   //
   // Do Operation: Full store
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill out matrix
-    matrix_t m(-1.0);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill out matrix
+        matrix_t m(-1.0);
 
-    for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-      for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-        m.set(2*i*matrix_t::s_num_columns+j, i, j);
-      }
-    }
+        for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+        {
+          for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+          {
+            m.set(2 * i * matrix_t::s_num_columns + j, i, j);
+          }
+        }
 
-    // Store matrix to memory
-    if(matrix_t::layout_type::is_row_major()){
-      m.store_packed(data1_ptr, 2*matrix_t::s_num_columns, 1);
-    }
-    else{
-      m.store_strided(data1_ptr, 2*matrix_t::s_num_columns, 1);
-    }
-  });
+        // Store matrix to memory
+        if (matrix_t::layout_type::is_row_major())
+        {
+          m.store_packed(data1_ptr, 2 * matrix_t::s_num_columns, 1);
+        }
+        else
+        {
+          m.store_strided(data1_ptr, 2 * matrix_t::s_num_columns, 1);
+        }
+      });
 
   tensor_copy_to_host<policy_t>(data1_vec, data1_ptr);
 
@@ -86,33 +100,41 @@ void Store_RowMajorImpl()
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-    for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-      if(i < matrix_t::s_num_rows && j < matrix_t::s_num_columns){
-//        printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1_h(i,j), data2_h(i,j));
-        ASSERT_SCALAR_EQ(data1_h(i,j), data2_h(i,j));
+  for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+  {
+    for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+    {
+      if (i < matrix_t::s_num_rows && j < matrix_t::s_num_columns)
+      {
+        //        printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1_h(i,j),
+        //        data2_h(i,j));
+        ASSERT_SCALAR_EQ(data1_h(i, j), data2_h(i, j));
       }
-      else{
-//        printf("%d,%d:  %lf, -2\n", (int)i, (int)j, data1_h(i,j));
-        ASSERT_SCALAR_EQ(data1_h(i,j), element_t(-2));
+      else
+      {
+        //        printf("%d,%d:  %lf, -2\n", (int)i, (int)j, data1_h(i,j));
+        ASSERT_SCALAR_EQ(data1_h(i, j), element_t(-2));
       }
     }
   }
 
 
-
   //
   // Loop over all possible sub-matrix sizes using the load_*_nm routines
   //
-  for(camp::idx_t n_size = 0;n_size <= matrix_t::s_num_rows; ++ n_size){
-    for(camp::idx_t m_size = 0;m_size <= matrix_t::s_num_columns; ++ m_size){
+  for (camp::idx_t n_size = 0; n_size <= matrix_t::s_num_rows; ++n_size)
+  {
+    for (camp::idx_t m_size = 0; m_size <= matrix_t::s_num_columns; ++m_size)
+    {
 
       //
       // Clear data1
       //
-      for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-          data1_h(i,j) = element_t(-2);
+      for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+        {
+          data1_h(i, j) = element_t(-2);
         }
       }
       tensor_copy_to_device<policy_t>(data1_ptr, data1_vec);
@@ -121,25 +143,32 @@ void Store_RowMajorImpl()
       //
       // Do Operation: Partial Store
       //
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-        // fill out matrix
-        matrix_t m(-1.0);
-
-        for(camp::idx_t i = 0;i < matrix_t::s_num_rows; ++ i){
-          for(camp::idx_t j = 0;j < matrix_t::s_num_columns; ++ j){
-            m.set(2*i*matrix_t::s_num_columns+j, i, j);
-          }
-        }
-
-        // Store matrix to memory
-        if(matrix_t::layout_type::is_row_major()){
-          m.store_packed_nm(data1_ptr, 2*matrix_t::s_num_columns, 1, n_size, m_size);
-        }
-        else{
-          m.store_strided_nm(data1_ptr, 2*matrix_t::s_num_columns, 1, n_size, m_size);
-        }
-
-      });
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            // fill out matrix
+            matrix_t m(-1.0);
+
+            for (camp::idx_t i = 0; i < matrix_t::s_num_rows; ++i)
+            {
+              for (camp::idx_t j = 0; j < matrix_t::s_num_columns; ++j)
+              {
+                m.set(2 * i * matrix_t::s_num_columns + j, i, j);
+              }
+            }
+
+            // Store matrix to memory
+            if (matrix_t::layout_type::is_row_major())
+            {
+              m.store_packed_nm(data1_ptr, 2 * matrix_t::s_num_columns, 1,
+                                n_size, m_size);
+            }
+            else
+            {
+              m.store_strided_nm(data1_ptr, 2 * matrix_t::s_num_columns, 1,
+                                 n_size, m_size);
+            }
+          });
 
 
       tensor_copy_to_host<policy_t>(data1_vec, data1_ptr);
@@ -148,20 +177,24 @@ void Store_RowMajorImpl()
       //
       // Check results
       //
-      for(camp::idx_t i = 0;i < 2*matrix_t::s_num_rows; ++ i){
-        for(camp::idx_t j = 0;j < 2*matrix_t::s_num_columns; ++ j){
-          if(i < n_size && j < m_size){
-//            printf("%d,%d:  %lf, %lf\n", (int)i, (int)j, data1_h(i,j), data2_h(i,j));
-            ASSERT_SCALAR_EQ(data1_h(i,j), data2_h(i,j));
+      for (camp::idx_t i = 0; i < 2 * matrix_t::s_num_rows; ++i)
+      {
+        for (camp::idx_t j = 0; j < 2 * matrix_t::s_num_columns; ++j)
+        {
+          if (i < n_size && j < m_size)
+          {
+            //            printf("%d,%d:  %lf, %lf\n", (int)i, (int)j,
+            //            data1_h(i,j), data2_h(i,j));
+            ASSERT_SCALAR_EQ(data1_h(i, j), data2_h(i, j));
           }
-          else{
-//            printf("%d,%d:  %lf, -2\n", (int)i, (int)j, data1_h(i,j));
-            ASSERT_SCALAR_EQ(data1_h(i,j), element_t(-2));
+          else
+          {
+            //            printf("%d,%d:  %lf, -2\n", (int)i, (int)j,
+            //            data1_h(i,j));
+            ASSERT_SCALAR_EQ(data1_h(i, j), element_t(-2));
           }
         }
       }
-
-
     }
   }
 
@@ -173,7 +206,6 @@ void Store_RowMajorImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorMatrix, Store_RowMajor)
 {
   Store_RowMajorImpl<TypeParam>();
diff --git a/test/functional/tensor/matrix/tests/test-tensor-matrix-Transpose.hpp b/test/functional/tensor/matrix/tests/test-tensor-matrix-Transpose.hpp
index 1be42b1ab8..dbd1b14c9a 100644
--- a/test/functional/tensor/matrix/tests/test-tensor-matrix-Transpose.hpp
+++ b/test/functional/tensor/matrix/tests/test-tensor-matrix-Transpose.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_MATRIX_Transpose_HPP__
 #define __TEST_TENSOR_MATRIX_Transpose_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename MATRIX_TYPE>
 void TransposeImpl()
 {
 
-  using matrix_t = MATRIX_TYPE;
-  using policy_t = typename matrix_t::register_policy;
+  using matrix_t  = MATRIX_TYPE;
+  using policy_t  = typename matrix_t::register_policy;
   using element_t = typename matrix_t::element_type;
 
   using transpose_t = typename matrix_t::transpose_type;
@@ -24,7 +24,7 @@ void TransposeImpl()
   static constexpr camp::idx_t N = matrix_t::s_num_rows;
   static constexpr camp::idx_t M = matrix_t::s_num_columns;
 
-//  bool is_row_major = matrix_t::layout_type::is_row_major();
+  //  bool is_row_major = matrix_t::layout_type::is_row_major();
 
   //
   // Allocate Row-Major Data
@@ -32,91 +32,86 @@ void TransposeImpl()
 
   // alloc input0
 
-  std::vector<element_t> input0_vec(N*M);
+  std::vector<element_t> input0_vec(N * M);
   RAJA::View<element_t, RAJA::Layout<2>> input0_h(input0_vec.data(), N, M);
 
-  element_t *input0_ptr = tensor_malloc<policy_t>(input0_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> input0_d(input0_ptr,  N, M);
-
+  element_t* input0_ptr = tensor_malloc<policy_t>(input0_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> input0_d(input0_ptr, N, M);
 
 
   // alloc output0
 
-  std::vector<element_t> output0_vec(N*M);
-  RAJA::View<element_t, RAJA::Layout<2>> output0_h(output0_vec.data(),  M, N);
-
-  element_t *output0_ptr = tensor_malloc<policy_t>(output0_vec);
-  RAJA::View<element_t, RAJA::Layout<2>> output0_d(output0_ptr,  M, N);
+  std::vector<element_t> output0_vec(N * M);
+  RAJA::View<element_t, RAJA::Layout<2>> output0_h(output0_vec.data(), M, N);
 
+  element_t* output0_ptr = tensor_malloc<policy_t>(output0_vec);
+  RAJA::View<element_t, RAJA::Layout<2>> output0_d(output0_ptr, M, N);
 
 
   // Fill input0 and output0
-  for(camp::idx_t i = 0;i < N; ++ i){
-    for(camp::idx_t j = 0;j < M; ++ j){
-      input0_h(i,j) = i*matrix_t::s_num_columns+j;
+  for (camp::idx_t i = 0; i < N; ++i)
+  {
+    for (camp::idx_t j = 0; j < M; ++j)
+    {
+      input0_h(i, j) = i * matrix_t::s_num_columns + j;
     }
   }
 
   tensor_copy_to_device<policy_t>(input0_ptr, input0_vec);
 
 
-
-
   //
   // Do Operation: transpose
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // load original matrix
-    matrix_t A;
-    A.load_strided(input0_ptr, M, 1);
-
-    // transpose matrix
-    transpose_t B = A.transpose();
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // load original matrix
+        matrix_t A;
+        A.load_strided(input0_ptr, M, 1);
 
-    // store transposed matrix
-    B.store_strided(output0_ptr, N, 1);
+        // transpose matrix
+        transpose_t B = A.transpose();
 
-  });
+        // store transposed matrix
+        B.store_strided(output0_ptr, N, 1);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_ptr);
 
 
   printf("gtest result:\n");
-  for(camp::idx_t i = 0;i < M; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      printf("%3d ", (int)output0_h(i,j));
+  for (camp::idx_t i = 0; i < M; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      printf("%3d ", (int)output0_h(i, j));
     }
     printf("\n");
   }
 
 
-
   //
   // Check results
   //
-  for(camp::idx_t i = 0;i < M; ++ i){
-    for(camp::idx_t j = 0;j < N; ++ j){
-      ASSERT_SCALAR_EQ(output0_h(i,j), input0_h(j,i));
+  for (camp::idx_t i = 0; i < M; ++i)
+  {
+    for (camp::idx_t j = 0; j < N; ++j)
+    {
+      ASSERT_SCALAR_EQ(output0_h(i, j), input0_h(j, i));
     }
   }
 
 
-
   //
   // Free data
   //
   tensor_free<policy_t>(input0_ptr);
   tensor_free<policy_t>(output0_ptr);
-
 }
 
 
-
-TYPED_TEST_P(TestTensorMatrix, Transpose)
-{
-  TransposeImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorMatrix, Transpose) { TransposeImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Add.hpp b/test/functional/tensor/register/tests/test-tensor-register-Add.hpp
index ae9a93c3ad..b936803efd 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Add.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Add.hpp
@@ -8,35 +8,36 @@
 #ifndef __TEST_TENSOR_REGISTER_Add_HPP__
 #define __TEST_TENSOR_REGISTER_Add_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void AddImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t  i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (element_t)(i*i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (element_t)(i * i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -48,93 +49,95 @@ void AddImpl()
   //
 
   // operator +
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
+        register_t z = x + y;
 
-    register_t z = x + y;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] + input1_vec[lane], output0_vec[lane]);
   }
 
 
-
   // operator +=
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
+        register_t z = x;
 
-    register_t z = x;
+        z += y;
 
-    z += y;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] + input1_vec[lane], output0_vec[lane]);
   }
 
 
-
-
   // operator + scalar
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    register_t x;
-    x.load_packed(input0_dptr);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t z = x + 7;
+        register_t z = x + 7;
 
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] + 7, output0_vec[lane]);
   }
 
 
-
-
   // operator += scalar
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    register_t x;
-    x.load_packed(input0_dptr);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t z = x;
+        register_t z = x;
 
-    z += 3;
+        z += 3;
 
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] + 3, output0_vec[lane]);
   }
 
 
-
   // Cleanup
   tensor_free<policy_t>(input0_dptr);
   tensor_free<policy_t>(input1_dptr);
@@ -142,11 +145,7 @@ void AddImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Add)
-{
-  AddImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Add) { AddImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Divide.hpp b/test/functional/tensor/register/tests/test-tensor-register-Divide.hpp
index 33efe4ba27..7ba22b6a80 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Divide.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Divide.hpp
@@ -8,35 +8,36 @@
 #ifndef __TEST_TENSOR_REGISTER_Divide_HPP__
 #define __TEST_TENSOR_REGISTER_Divide_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void DivideImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (element_t)(i*i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (element_t)(i * i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -48,88 +49,91 @@ void DivideImpl()
   //
 
   // operator /
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
+        register_t z = x / y;
 
-    register_t z = x / y;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] / input1_vec[lane], output0_vec[lane]);
   }
 
 
-
   // operator /=
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
+        register_t z = x;
 
-    register_t z = x;
+        z /= y;
 
-    z /= y;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] / input1_vec[lane], output0_vec[lane]);
   }
 
 
-
-
   // operator / scalar
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    register_t x;
-    x.load_packed(input0_dptr);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t z = x / 7;
+        register_t z = x / 7;
 
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] / 7, output0_vec[lane]);
   }
 
 
-
-
-  // operator += scalar
+  // operator /= scalar
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t z = x;
 
-    register_t z = x;
+        z /= 3;
 
-    z /= 3;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] / 3, output0_vec[lane]);
   }
 
@@ -137,33 +141,37 @@ void DivideImpl()
   //
   // Test variable length operations for all valid lengths
   //
-  for(camp::idx_t  N = 0;N < num_elem; ++N){
-
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  for (camp::idx_t N = 0; N < num_elem; ++N)
+  {
 
-      register_t x;
-      x.load_packed_n(input0_dptr, N);
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          register_t x;
+          x.load_packed_n(input0_dptr, N);
 
-      register_t y;
-      y.load_packed_n(input1_dptr, N);
+          register_t y;
+          y.load_packed_n(input1_dptr, N);
 
-      register_t z = x.divide_n(y,N);
+          register_t z = x.divide_n(y, N);
 
-      z.store_packed(output0_dptr);
-    });
+          z.store_packed(output0_dptr);
+        });
 
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-    for(camp::idx_t  lane = 0;lane < num_elem;++ lane){
-      if(lane < N){
-        ASSERT_SCALAR_EQ(input0_vec[lane] / input1_vec[lane], output0_vec[lane]);
+    for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+    {
+      if (lane < N)
+      {
+        ASSERT_SCALAR_EQ(input0_vec[lane] / input1_vec[lane],
+                         output0_vec[lane]);
       }
-      else{
+      else
+      {
         ASSERT_SCALAR_EQ(0, output0_vec[lane]);
       }
     }
-
-
   }
 
 
@@ -174,11 +182,7 @@ void DivideImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Divide)
-{
-  DivideImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Divide) { DivideImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-DotProduct.hpp b/test/functional/tensor/register/tests/test-tensor-register-DotProduct.hpp
index 6a414dd7d0..dcd47e50e0 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-DotProduct.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-DotProduct.hpp
@@ -8,35 +8,36 @@
 #ifndef __TEST_TENSOR_REGISTER_DotProduct_HPP__
 #define __TEST_TENSOR_REGISTER_DotProduct_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void DotProductImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(1);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(1);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(1);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (element_t)(i*i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (element_t)(i * i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -47,28 +48,29 @@ void DotProductImpl()
   //  Check full-length operations
   //
 
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
 
-
-    output0_dptr[0] = x.dot(y);
-  });
+        output0_dptr[0] = x.dot(y);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   element_t expected = 0;
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     expected += input0_vec[lane] * input1_vec[lane];
   }
   ASSERT_SCALAR_EQ(expected, output0_vec[0]);
 
 
-
   // Cleanup
   tensor_free<policy_t>(input0_dptr);
   tensor_free<policy_t>(input1_dptr);
@@ -76,11 +78,7 @@ void DotProductImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, DotProduct)
-{
-  DotProductImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, DotProduct) { DotProductImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-FMA.hpp b/test/functional/tensor/register/tests/test-tensor-register-FMA.hpp
index e03529d183..f2294ab3ae 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-FMA.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-FMA.hpp
@@ -8,40 +8,41 @@
 #ifndef __TEST_TENSOR_REGISTER_FMA_HPP__
 #define __TEST_TENSOR_REGISTER_FMA_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void FMAImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input2_vec(num_elem);
-  element_t *input2_hptr = input2_vec.data();
-  element_t *input2_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input2_hptr = input2_vec.data();
+  element_t* input2_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (element_t)(i*i+1+NO_OPT_RAND);
-   input2_hptr[i] = (element_t)(i+i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (element_t)(i * i + 1 + NO_OPT_RAND);
+    input2_hptr[i] = (element_t)(i + i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -54,31 +55,32 @@ void FMAImpl()
   //
 
   // operator z = a*b+c
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t a;
+        a.load_packed(input0_dptr);
 
-    register_t a;
-    a.load_packed(input0_dptr);
+        register_t b;
+        b.load_packed(input1_dptr);
 
-    register_t b;
-    b.load_packed(input1_dptr);
+        register_t c;
+        c.load_packed(input2_dptr);
 
-    register_t c;
-    c.load_packed(input2_dptr);
+        register_t z = a.multiply_add(b, c);
 
-    register_t z = a.multiply_add(b,c);
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
-    ASSERT_SCALAR_EQ(input0_vec[lane] * input1_vec[lane] + input2_vec[lane], output0_vec[lane]);
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
+    ASSERT_SCALAR_EQ(input0_vec[lane] * input1_vec[lane] + input2_vec[lane],
+                     output0_vec[lane]);
   }
 
 
-
-
   // Cleanup
   tensor_free<policy_t>(input0_dptr);
   tensor_free<policy_t>(input1_dptr);
@@ -87,11 +89,7 @@ void FMAImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, FMA)
-{
-  FMAImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, FMA) { FMAImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-FMS.hpp b/test/functional/tensor/register/tests/test-tensor-register-FMS.hpp
index 2f8b53c0c9..05015c5560 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-FMS.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-FMS.hpp
@@ -8,40 +8,41 @@
 #ifndef __TEST_TENSOR_REGISTER_FMS_HPP__
 #define __TEST_TENSOR_REGISTER_FMS_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void FMSImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input2_vec(num_elem);
-  element_t *input2_hptr = input2_vec.data();
-  element_t *input2_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input2_hptr = input2_vec.data();
+  element_t* input2_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (element_t)(i*i+1+NO_OPT_RAND);
-   input2_hptr[i] = (element_t)(i+i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (element_t)(i * i + 1 + NO_OPT_RAND);
+    input2_hptr[i] = (element_t)(i + i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -54,31 +55,32 @@ void FMSImpl()
   //
 
   // operator z = a*b-c
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t a;
+        a.load_packed(input0_dptr);
 
-    register_t a;
-    a.load_packed(input0_dptr);
+        register_t b;
+        b.load_packed(input1_dptr);
 
-    register_t b;
-    b.load_packed(input1_dptr);
+        register_t c;
+        c.load_packed(input2_dptr);
 
-    register_t c;
-    c.load_packed(input2_dptr);
+        register_t z = a.multiply_subtract(b, c);
 
-    register_t z = a.multiply_subtract(b,c);
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
-    ASSERT_SCALAR_EQ(input0_vec[lane] * input1_vec[lane] - input2_vec[lane], output0_vec[lane]);
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
+    ASSERT_SCALAR_EQ(input0_vec[lane] * input1_vec[lane] - input2_vec[lane],
+                     output0_vec[lane]);
   }
 
 
-
-
   // Cleanup
   tensor_free<policy_t>(input0_dptr);
   tensor_free<policy_t>(input1_dptr);
@@ -87,11 +89,7 @@ void FMSImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, FMS)
-{
-  FMSImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, FMS) { FMSImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Gather.hpp b/test/functional/tensor/register/tests/test-tensor-register-Gather.hpp
index 37429b5087..3aa665712c 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Gather.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Gather.hpp
@@ -8,43 +8,45 @@
 #ifndef __TEST_TENSOR_REGISTER_Gather_HPP__
 #define __TEST_TENSOR_REGISTER_Gather_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void GatherImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // get the integer indexing types
   using int_register_t = typename register_t::int_vector_type;
-  using index_t = typename int_register_t::element_type;
+  using index_t        = typename int_register_t::element_type;
 
   // Allocate
 
   // Data to be read (10x larger than output)
-  std::vector<element_t> input0_vec(10*num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(10*num_elem);
+  std::vector<element_t> input0_vec(10 * num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(10 * num_elem);
 
   // Indexing into input0
   std::vector<index_t> input1_vec(num_elem);
-  index_t *input1_hptr = input1_vec.data();
-  index_t *input1_dptr = tensor_malloc<policy_t, index_t>(num_elem);
+  index_t* input1_hptr = input1_vec.data();
+  index_t* input1_dptr = tensor_malloc<policy_t, index_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < 10*num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < 10 * num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
   }
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input1_hptr[i] = (index_t)(3*i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input1_hptr[i] = (index_t)(3 * i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -56,23 +58,25 @@ void GatherImpl()
   //
 
   // operator z[i] = a[b[i]]
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // get offsets
+        int_register_t idx;
+        idx.load_packed(input1_dptr);
 
-    // get offsets
-    int_register_t idx;
-    idx.load_packed(input1_dptr);
+        // gather elements from the given offsets in idx
+        register_t a;
+        a.gather(input0_dptr, idx);
 
-    // gather elements from a given offsets in idx
-    register_t a;
-    a.gather(input0_dptr, idx);
-
-    // write out gathered elements in packed order
-    a.store_packed(output0_dptr);
-  });
+        // write out gathered elements in packed order
+        a.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[input1_vec[lane]], output0_vec[lane]);
   }
 
@@ -81,36 +85,40 @@ void GatherImpl()
   // Check partial length operations
   //
 
-  for(camp::idx_t N = 0;N <= num_elem;++ N){
+  for (camp::idx_t N = 0; N <= num_elem; ++N)
+  {
 
     // operator z[i] = a[b[i]]
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-      // get offsets
-      int_register_t idx;
-      idx.load_packed_n(input1_dptr, N);
-
-      // gather elements from a given offsets in idx
-      register_t a;
-      a.gather_n(input0_dptr, idx, N);
-
-      // write out gathered elements in packed order
-      // we're writing out entire length to check the zeroing
-      a.store_packed(output0_dptr);
-    });
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          // get offsets
+          int_register_t idx;
+          idx.load_packed_n(input1_dptr, N);
+
+          // gather elements from the given offsets in idx
+          register_t a;
+          a.gather_n(input0_dptr, idx, N);
+
+          // write out gathered elements in packed order
+          // we're writing out entire length to check the zeroing
+          a.store_packed(output0_dptr);
+        });
 
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
 
-    for(camp::idx_t lane = 0;lane < num_elem;++ lane){
-      if(lane < N){
+    for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+    {
+      if (lane < N)
+      {
         ASSERT_SCALAR_EQ(input0_vec[input1_vec[lane]], output0_vec[lane]);
       }
-      else{
+      else
+      {
         ASSERT_SCALAR_EQ((element_t)0, output0_vec[lane]);
       }
     }
-
   }
 
 
@@ -121,11 +129,7 @@ void GatherImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Gather)
-{
-  GatherImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Gather) { GatherImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-GetSet.hpp b/test/functional/tensor/register/tests/test-tensor-register-GetSet.hpp
index 194412d999..b735c05ece 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-GetSet.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-GetSet.hpp
@@ -8,51 +8,55 @@
 #ifndef __TEST_TENSOR_REGISTER_GetSet_HPP__
 #define __TEST_TENSOR_REGISTER_GetSet_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void GetSetImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
 
   // Test set and get operations
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill x using set
-    register_t x;
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      x.set(input0_dptr[i], i);
-    }
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = x.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill x using set
+        register_t x;
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          x.set(input0_dptr[i], i);
+        }
+
+        // extract from x using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = x.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i]);
   }
 
@@ -60,158 +64,161 @@ void GetSetImpl()
   //
   // test copy construction
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill x using set
-    register_t x;
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      x.set(input0_dptr[i], i);
-    }
-
-    register_t cc(x);
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = cc.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill x using set
+        register_t x;
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          x.set(input0_dptr[i], i);
+        }
+
+        register_t cc(x);
+
+        // extract from cc using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = cc.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i]);
   }
 
 
-
-
   //
   // test explicit copy
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill x using set
-    register_t x;
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      x.set(input0_dptr[i], i);
-    }
-
-    register_t cc;
-    cc.copy(x);
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = cc.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill x using set
+        register_t x;
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          x.set(input0_dptr[i], i);
+        }
+
+        register_t cc;
+        cc.copy(x);
+
+        // extract from cc using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = cc.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i]);
   }
 
 
-
-
   //
   // test assignment
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill x using set
-    register_t x;
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      x.set(input0_dptr[i], i);
-    }
-
-    register_t cc = x;
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = cc.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill x using set
+        register_t x;
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          x.set(input0_dptr[i], i);
+        }
+
+        register_t cc = x;
+
+        // extract from cc using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = cc.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i]);
   }
 
 
-
-
   //
   // test scalar construction (broadcast)
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-
-    register_t cc = (element_t) 5;
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = cc.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t cc = (element_t)5;
+
+        // extract from cc using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = cc.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], (element_t)5);
   }
 
 
-
-
-
   //
   // test scalar broadcast by assignment
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-
-    register_t cc = (element_t) 0;
-    cc = (element_t) 11.0;
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = cc.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t cc = (element_t)0;
+        cc            = (element_t)11.0;
+
+        // extract from cc using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = cc.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], (element_t)11);
   }
 
 
-
   //
   // test scalar explicit broadcast
   //
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    register_t cc = (element_t) 0;
-    cc.broadcast(13.0);
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = cc.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t cc = (element_t)0;
+        cc.broadcast(13.0);
+
+        // extract from cc using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = cc.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], (element_t)13);
   }
 
@@ -224,11 +231,7 @@ void GetSetImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, GetSet)
-{
-  GetSetImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, GetSet) { GetSetImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Load.hpp b/test/functional/tensor/register/tests/test-tensor-register-Load.hpp
index afe738b037..768965aad0 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Load.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Load.hpp
@@ -8,131 +8,140 @@
 #ifndef __TEST_TENSOR_REGISTER_Load_HPP__
 #define __TEST_TENSOR_REGISTER_Load_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void LoadImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
-  std::vector<element_t> input0_vec(10*num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(10*num_elem);
+  std::vector<element_t> input0_vec(10 * num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(10 * num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < 10*num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < 10 * num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
 
 
-
-
   // load stride-1 from pointer
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill x using set
-    register_t x;
-    x.load_packed(input0_dptr);
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = x.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill x using load_packed
+        register_t x;
+        x.load_packed(input0_dptr);
+
+        // extract from x using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = x.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i]);
   }
 
 
-
-  for(camp::idx_t N = 0;N < num_elem; ++ N){
+  for (camp::idx_t N = 0; N < num_elem; ++N)
+  {
     // load stride-1 from pointer
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-      // fill x using set
-      register_t x;
-      x.load_packed_n(input0_dptr, N);
-
-      // extract from x using get
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
-        output0_dptr[i] = x.get(i);
-      }
-
-    });
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          // fill x using load_packed_n
+          register_t x;
+          x.load_packed_n(input0_dptr, N);
+
+          // extract from x using get
+          for (camp::idx_t i = 0; i < num_elem; ++i)
+          {
+            output0_dptr[i] = x.get(i);
+          }
+        });
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
     // check that we were able to copy using set/get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      if(i < N){
+    for (camp::idx_t i = 0; i < num_elem; ++i)
+    {
+      if (i < N)
+      {
         ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i]);
       }
-      else{
+      else
+      {
         ASSERT_SCALAR_EQ(output0_vec[i], (element_t)0);
       }
     }
   }
 
 
-
-
   // load stride-2 from pointer
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill x using set
-    register_t x;
-    x.load_strided(input0_dptr, 2);
-
-    // extract from x using get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      output0_dptr[i] = x.get(i);
-    }
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill x using load_strided
+        register_t x;
+        x.load_strided(input0_dptr, 2);
+
+        // extract from x using get
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          output0_dptr[i] = x.get(i);
+        }
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-    ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i*2]);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i * 2]);
   }
 
 
-
-  for(camp::idx_t N = 0;N < num_elem; ++ N){
+  for (camp::idx_t N = 0; N < num_elem; ++N)
+  {
     // load stride-2 from pointer
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-      // fill x using set
-      register_t x;
-      x.load_strided_n(input0_dptr, 2, N);
-
-      // extract from x using get
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
-        output0_dptr[i] = x.get(i);
-      }
-
-    });
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          // fill x using load_strided_n
+          register_t x;
+          x.load_strided_n(input0_dptr, 2, N);
+
+          // extract from x using get
+          for (camp::idx_t i = 0; i < num_elem; ++i)
+          {
+            output0_dptr[i] = x.get(i);
+          }
+        });
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
     // check that we were able to copy using set/get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      if(i < N){
-        ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i*2]);
+    for (camp::idx_t i = 0; i < num_elem; ++i)
+    {
+      if (i < N)
+      {
+        ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i * 2]);
       }
-      else{
+      else
+      {
         ASSERT_SCALAR_EQ(output0_vec[i], (element_t)0);
       }
     }
@@ -147,11 +156,7 @@ void LoadImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Load)
-{
-  LoadImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Load) { LoadImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Max.hpp b/test/functional/tensor/register/tests/test-tensor-register-Max.hpp
index f4bce2e7a9..319e0cac57 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Max.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Max.hpp
@@ -8,38 +8,39 @@
 #ifndef __TEST_TENSOR_REGISTER_Max_HPP__
 #define __TEST_TENSOR_REGISTER_Max_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void MaxImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(1);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(1);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(1);
 
   std::vector<element_t> output1_vec(num_elem);
-  element_t *output1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-    input0_hptr[i] = (element_t)(rand()*1000/RAND_MAX);
-    input1_hptr[i] = (element_t)(rand()*1000/RAND_MAX);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)((long long)rand() * 1000 / RAND_MAX);
+    input1_hptr[i] = (element_t)((long long)rand() * 1000 / RAND_MAX);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -50,33 +51,34 @@ void MaxImpl()
   //  Check full-length operations
   //
 
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // load input vectors
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    // load input vectors
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
 
+        // compute reduction
+        output0_dptr[0] = x.max();
 
-    // compute reduction
-    output0_dptr[0] = x.max();
 
-
-    // compute element-wise
-    register_t z = x.vmax(y);
-    z.store_packed(output1_dptr);
-  });
+        // compute element-wise
+        register_t z = x.vmax(y);
+        z.store_packed(output1_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
   tensor_copy_to_host<policy_t>(output1_vec, output1_dptr);
 
 
-
   // compute expected value for reduction
   element_t expected = input0_vec[0];
-  for(camp::idx_t i = 1;i < num_elem;++i){
+  for (camp::idx_t i = 1; i < num_elem; ++i)
+  {
     expected = expected < input0_vec[i] ? input0_vec[i] : expected;
   }
 
@@ -85,40 +87,43 @@ void MaxImpl()
 
 
   // check element-wise operation
-  for(camp::idx_t i = 0;i < num_elem;++i){
-    ASSERT_SCALAR_EQ(std::max<element_t>(input0_vec[i], input1_vec[i]), output1_vec[i]);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    ASSERT_SCALAR_EQ(std::max<element_t>(input0_vec[i], input1_vec[i]),
+                     output1_vec[i]);
   }
 
 
   //
   // check variable length operator
   //
-  for(camp::idx_t N = 0;N <= num_elem;++ N){
+  for (camp::idx_t N = 0; N <= num_elem; ++N)
+  {
     //
     //  Check full-length operations
     //
 
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-      register_t x;
-      x.load_packed(input0_dptr);
-
-      output0_dptr[0] = x.max_n(N);
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          register_t x;
+          x.load_packed(input0_dptr);
 
-    });
+          output0_dptr[0] = x.max_n(N);
+        });
 
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
 
     // compute expected value for reduction
     element_t expected = RAJA::operators::limits<element_t>::min();
-    for(camp::idx_t i = 0;i < N;++i){
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
       expected = expected < input0_vec[i] ? input0_vec[i] : expected;
     }
 
     // check reduction
     ASSERT_SCALAR_EQ(expected, output0_vec[0]);
-
   }
 
   // Cleanup
@@ -129,11 +134,7 @@ void MaxImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Max)
-{
-  MaxImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Max) { MaxImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Min.hpp b/test/functional/tensor/register/tests/test-tensor-register-Min.hpp
index 957d9fbf1d..aad3a0333c 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Min.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Min.hpp
@@ -8,37 +8,38 @@
 #ifndef __TEST_TENSOR_REGISTER_Min_HPP__
 #define __TEST_TENSOR_REGISTER_Min_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void MinImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(1);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(1);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(1);
 
   std::vector<element_t> output1_vec(num_elem);
-  element_t *output1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(rand()*1000/RAND_MAX);
-   input0_hptr[i] = (element_t)(rand()*1000/RAND_MAX);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)((long long)rand() * 1000 / RAND_MAX);
+    input1_vec[i]  = (element_t)((long long)rand() * 1000 / RAND_MAX);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -49,33 +50,34 @@ void MinImpl()
   //  Check full-length operations
   //
 
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // load input vectors
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    // load input vectors
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
 
+        // compute reduction
+        output0_dptr[0] = x.min();
 
-    // compute reduction
-    output0_dptr[0] = x.min();
 
-
-    // compute element-wise
-    register_t z = x.vmin(y);
-    z.store_packed(output1_dptr);
-  });
+        // compute element-wise
+        register_t z = x.vmin(y);
+        z.store_packed(output1_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
   tensor_copy_to_host<policy_t>(output1_vec, output1_dptr);
 
 
-
   // compute expected value for reduction
   element_t expected = input0_vec[0];
-  for(camp::idx_t i = 1;i < num_elem;++i){
+  for (camp::idx_t i = 1; i < num_elem; ++i)
+  {
     expected = expected > input0_vec[i] ? input0_vec[i] : expected;
   }
 
@@ -84,40 +86,43 @@ void MinImpl()
 
 
   // check element-wise operation
-  for(camp::idx_t i = 0;i < num_elem;++i){
-    ASSERT_SCALAR_EQ(std::min<element_t>(input0_vec[i], input1_vec[i]), output1_vec[i]);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    ASSERT_SCALAR_EQ(std::min<element_t>(input0_vec[i], input1_vec[i]),
+                     output1_vec[i]);
   }
 
 
   //
   // check variable length operator
   //
-  for(camp::idx_t N = 0;N <= num_elem;++ N){
+  for (camp::idx_t N = 0; N <= num_elem; ++N)
+  {
     //
     //  Check full-length operations
     //
 
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-      register_t x;
-      x.load_packed(input0_dptr);
-
-      output0_dptr[0] = x.min_n(N);
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          register_t x;
+          x.load_packed(input0_dptr);
 
-    });
+          output0_dptr[0] = x.min_n(N);
+        });
 
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
 
     // compute expected value for reduction
     element_t expected = RAJA::operators::limits<element_t>::max();
-    for(camp::idx_t i = 0;i < N;++i){
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
       expected = expected > input0_vec[i] ? input0_vec[i] : expected;
     }
 
     // check reduction
     ASSERT_SCALAR_EQ(expected, output0_vec[0]);
-
   }
 
   // Cleanup
@@ -128,11 +133,7 @@ void MinImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Min)
-{
-  MinImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Min) { MinImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Multiply.hpp b/test/functional/tensor/register/tests/test-tensor-register-Multiply.hpp
index 0ed4d4ad39..33072a50e5 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Multiply.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Multiply.hpp
@@ -8,35 +8,36 @@
 #ifndef __TEST_TENSOR_REGISTER_Multiply_HPP__
 #define __TEST_TENSOR_REGISTER_Multiply_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void MultiplyImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (element_t)(i*i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (element_t)(i * i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -48,93 +49,95 @@ void MultiplyImpl()
   //
 
   // operator *
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
+        register_t z = x * y;
 
-    register_t z = x * y;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] * input1_vec[lane], output0_vec[lane]);
   }
 
 
-
   // operator *=
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
+        register_t z = x;
 
-    register_t z = x;
+        z *= y;
 
-    z *= y;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] * input1_vec[lane], output0_vec[lane]);
   }
 
 
-
-
   // operator * scalar
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    register_t x;
-    x.load_packed(input0_dptr);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t z = x * 7;
+        register_t z = x * 7;
 
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] * 7, output0_vec[lane]);
   }
 
 
-
-
   // operator *= scalar
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    register_t x;
-    x.load_packed(input0_dptr);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t z = x;
+        register_t z = x;
 
-    z *= 3;
+        z *= 3;
 
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] * 3, output0_vec[lane]);
   }
 
 
-
   // Cleanup
   tensor_free<policy_t>(input0_dptr);
   tensor_free<policy_t>(input1_dptr);
@@ -142,11 +145,7 @@ void MultiplyImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Multiply)
-{
-  MultiplyImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Multiply) { MultiplyImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Scatter.hpp b/test/functional/tensor/register/tests/test-tensor-register-Scatter.hpp
index dc27f15b7b..23a29a9bd7 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Scatter.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Scatter.hpp
@@ -8,44 +8,45 @@
 #ifndef __TEST_TENSOR_REGISTER_Scatter_HPP__
 #define __TEST_TENSOR_REGISTER_Scatter_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void ScatterImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // get the integer indexing types
   using int_register_t = typename register_t::int_vector_type;
-  using index_t = typename int_register_t::element_type;
+  using index_t        = typename int_register_t::element_type;
 
   // Allocate
 
   // Data to be read
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   // Indexing into output0
   std::vector<index_t> input1_vec(num_elem);
-  index_t *input1_hptr = input1_vec.data();
-  index_t *input1_dptr = tensor_malloc<policy_t, index_t>(num_elem);
+  index_t* input1_hptr = input1_vec.data();
+  index_t* input1_dptr = tensor_malloc<policy_t, index_t>(num_elem);
 
   // Scattered output (10x larger than output)
-  std::vector<element_t> output0_vec(10*num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(10*num_elem);
+  std::vector<element_t> output0_vec(10 * num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(10 * num_elem);
 
   // precomputed expected output
-  std::vector<element_t> expected(10*num_elem);
+  std::vector<element_t> expected(10 * num_elem);
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (index_t)(3*i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (index_t)(3 * i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -53,8 +54,9 @@ void ScatterImpl()
 
 
   // Initialize output
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   output0_vec[i] = (element_t)0;
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    output0_vec[i] = (element_t)0;
   }
   tensor_copy_to_device<policy_t>(output0_dptr, output0_vec);
 
@@ -64,29 +66,33 @@ void ScatterImpl()
   //
 
   // operator z[b[i]] = a[i]
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        int_register_t idx;
+        idx.load_packed(input1_dptr);
 
-    int_register_t idx;
-    idx.load_packed(input1_dptr);
+        register_t a;
+        a.load_packed(input0_dptr);
 
-    register_t a;
-    a.load_packed(input0_dptr);
-
-    a.scatter(output0_dptr, idx);
-  });
+        a.scatter(output0_dptr, idx);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // compute expected value
-  for(camp::idx_t lane = 0;lane < 10*num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < 10 * num_elem; ++lane)
+  {
     expected[lane] = 0;
   }
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     expected[input1_vec[lane]] = input0_vec[lane];
   }
 
   // check result
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(expected[lane], output0_vec[lane]);
   }
 
@@ -95,44 +101,48 @@ void ScatterImpl()
   // Check partial length operations
   //
 
-  for(camp::idx_t N = 0;N <= num_elem;++ N){
+  for (camp::idx_t N = 0; N <= num_elem; ++N)
+  {
 
     // Initialize output
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-     output0_vec[i] = (element_t)0;
+    for (camp::idx_t i = 0; i < num_elem; ++i)
+    {
+      output0_vec[i] = (element_t)0;
     }
     tensor_copy_to_device<policy_t>(output0_dptr, output0_vec);
 
 
-
     // operator z[i] = a[b[i]]
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          int_register_t idx;
+          idx.load_packed(input1_dptr);
 
-      int_register_t idx;
-      idx.load_packed(input1_dptr);
+          register_t a;
+          a.load_packed(input0_dptr);
 
-      register_t a;
-      a.load_packed(input0_dptr);
-
-      a.scatter_n(output0_dptr, idx, N);
-    });
+          a.scatter_n(output0_dptr, idx, N);
+        });
 
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
 
     // compute expected value
-    for(camp::idx_t lane = 0;lane < 10*num_elem;++ lane){
+    for (camp::idx_t lane = 0; lane < 10 * num_elem; ++lane)
+    {
       expected[lane] = 0;
     }
-    for(camp::idx_t lane = 0;lane < N;++ lane){
+    for (camp::idx_t lane = 0; lane < N; ++lane)
+    {
       expected[input1_vec[lane]] = input0_vec[lane];
     }
 
     // check result
-    for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+    for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+    {
       ASSERT_SCALAR_EQ(expected[lane], output0_vec[lane]);
     }
-
   }
 
 
@@ -143,11 +153,7 @@ void ScatterImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Scatter)
-{
-  ScatterImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Scatter) { ScatterImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-SegmentedBroadcastInner.hpp b/test/functional/tensor/register/tests/test-tensor-register-SegmentedBroadcastInner.hpp
index c3394e981f..f843fc6ad9 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-SegmentedBroadcastInner.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-SegmentedBroadcastInner.hpp
@@ -8,57 +8,61 @@
 #ifndef __TEST_TENSOR_REGISTER_SegmentedBroadcastInner_HPP__
 #define __TEST_TENSOR_REGISTER_SegmentedBroadcastInner_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void SegmentedBroadcastInnerImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-//  printf("input: ");
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-    input0_hptr[i] = (element_t)(i+1); //+NO_OPT_RAND);
-//    printf("%lf ", (double)input0_hptr[i]);
+  //  printf("input: ");
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1);  //+NO_OPT_RAND);
+    //    printf("%lf ", (double)input0_hptr[i]);
   }
-//  printf("\n");
+  //  printf("\n");
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
 
 
-
   // run segmented dot products for all segments allowed by the vector
-  for(camp::idx_t segbits = 0;(1<<segbits) <= num_elem;++ segbits){
+  for (camp::idx_t segbits = 0; (1 << segbits) <= num_elem; ++segbits)
+  {
 
-    camp::idx_t num_segments = num_elem>>segbits;
+    camp::idx_t num_segments = num_elem >> segbits;
 
-    for(camp::idx_t input_segment = 0;input_segment < num_segments;++ input_segment){
-//      printf("segbits=%d, input_segment=%d\n", (camp::idx_t)segbits, (camp::idx_t)input_segment);
+    for (camp::idx_t input_segment = 0; input_segment < num_segments;
+         ++input_segment)
+    {
+      //      printf("segbits=%d, input_segment=%d\n", (camp::idx_t)segbits,
+      //      (camp::idx_t)input_segment);
 
       // Execute segmented broadcast
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-        register_t x;
-        x.load_packed(input0_dptr);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            register_t x;
+            x.load_packed(input0_dptr);
 
-        register_t y = x.segmented_broadcast_inner(segbits, input_segment);
+            register_t y = x.segmented_broadcast_inner(segbits, input_segment);
 
-        y.store_packed(output0_dptr);
-
-      });
+            y.store_packed(output0_dptr);
+          });
 
       // Move result to host
       tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
@@ -69,38 +73,40 @@ void SegmentedBroadcastInnerImpl()
       // Compute expected values
       element_t expected[num_elem];
 
-      camp::idx_t mask = (1<<segbits)-1;
+      camp::idx_t mask   = (1 << segbits) - 1;
       camp::idx_t offset = input_segment << segbits;
 
       // default implementation is dumb, just sum each value into
       // appropriate segment lane
-//      printf("Expected: ");
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      //      printf("Expected: ");
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
 
-        auto off = (i&mask) + offset;
+        auto off = (i & mask) + offset;
 
         expected[i] = input0_hptr[off];
 
-//        printf("%d ", (camp::idx_t)off);
-        //printf("%lf ", (double)expected[i]);
+        //        printf("%d ", (camp::idx_t)off);
+        // printf("%lf ", (double)expected[i]);
       }
-//      printf("\n");
+      //      printf("\n");
 
 
-//      printf("Result:   ");
-//      for(camp::idx_t i = 0;i < num_elem; ++ i){
-//        printf("%lf ", (double)output0_vec[i]);
-//      }
-//      printf("\n");
+      //      printf("Result:   ");
+      //      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      //        printf("%lf ", (double)output0_vec[i]);
+      //      }
+      //      printf("\n");
 
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
 
         ASSERT_SCALAR_EQ(expected[i], output0_vec[i]);
       }
 
-    } // segment
+    }  // segment
 
-  } // segbits
+  }  // segbits
 
 
   // Cleanup
@@ -109,7 +115,6 @@ void SegmentedBroadcastInnerImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorRegister, SegmentedBroadcastInner)
 {
   SegmentedBroadcastInnerImpl<TypeParam>();
diff --git a/test/functional/tensor/register/tests/test-tensor-register-SegmentedBroadcastOuter.hpp b/test/functional/tensor/register/tests/test-tensor-register-SegmentedBroadcastOuter.hpp
index 45c5739af0..aca677b975 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-SegmentedBroadcastOuter.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-SegmentedBroadcastOuter.hpp
@@ -8,56 +8,59 @@
 #ifndef __TEST_TENSOR_REGISTER_SegmentedBroadcastOuter_HPP__
 #define __TEST_TENSOR_REGISTER_SegmentedBroadcastOuter_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void SegmentedBroadcastOuterImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-//  printf("input: ");
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-    input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-//    printf("%lf ", (double)input0_hptr[i]);
+  //  printf("input: ");
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    //    printf("%lf ", (double)input0_hptr[i]);
   }
-//  printf("\n");
+  //  printf("\n");
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
 
 
-
   // run segmented dot products for all segments allowed by the vector
-  for(camp::idx_t segbits = 0;(1<<segbits) <= num_elem;++ segbits){
+  for (camp::idx_t segbits = 0; (1 << segbits) <= num_elem; ++segbits)
+  {
 
-    camp::idx_t num_segments = (1<<segbits);
+    camp::idx_t num_segments = (1 << segbits);
 
-    for(camp::idx_t input_segment = 0;input_segment < num_segments;++ input_segment){
+    for (camp::idx_t input_segment = 0; input_segment < num_segments;
+         ++input_segment)
+    {
 
       // Execute segmented broadcast
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-        register_t x;
-        x.load_packed(input0_dptr);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            register_t x;
+            x.load_packed(input0_dptr);
 
-        register_t y = x.segmented_broadcast_outer(segbits, input_segment);
+            register_t y = x.segmented_broadcast_outer(segbits, input_segment);
 
-        y.store_packed(output0_dptr);
-
-      });
+            y.store_packed(output0_dptr);
+          });
 
       // Move result to host
       tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
@@ -66,28 +69,30 @@ void SegmentedBroadcastOuterImpl()
       // Check result
 
       // Compute expected values
-//      printf("explode: segbits=%d, input_segment=%d\n", segbits, input_segment);
-//      printf("  expected:  ");
+      //      printf("explode: segbits=%d, input_segment=%d\n", segbits,
+      //      input_segment); printf("  expected:  ");
 
       element_t expected[num_elem];
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
-        camp::idx_t seg = i>>segbits;
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
+        camp::idx_t seg = i >> segbits;
 
-        camp::idx_t off = (num_elem>>segbits)*input_segment + seg;
+        camp::idx_t off = (num_elem >> segbits) * input_segment + seg;
 
         expected[i] = input0_hptr[off];
-//        printf("%lf ", (double)expected[i]);
+        //        printf("%lf ", (double)expected[i]);
       }
-//      printf("\n");
+      //      printf("\n");
 
 
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
         ASSERT_SCALAR_EQ(expected[i], output0_vec[i]);
       }
 
-    } // segment
+    }  // segment
 
-  } // segbits
+  }  // segbits
 
 
   // Cleanup
@@ -96,7 +101,6 @@ void SegmentedBroadcastOuterImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorRegister, SegmentedBroadcastOuter)
 {
   SegmentedBroadcastOuterImpl<TypeParam>();
diff --git a/test/functional/tensor/register/tests/test-tensor-register-SegmentedDotProduct.hpp b/test/functional/tensor/register/tests/test-tensor-register-SegmentedDotProduct.hpp
index d8243864e8..4332cf3430 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-SegmentedDotProduct.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-SegmentedDotProduct.hpp
@@ -8,62 +8,65 @@
 #ifndef __TEST_TENSOR_REGISTER_SegmentedDotProduct_HPP__
 #define __TEST_TENSOR_REGISTER_SegmentedDotProduct_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void SegmentedDotProductImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (element_t)(i*i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (element_t)(i * i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
   tensor_copy_to_device<policy_t>(input1_dptr, input1_vec);
 
 
-
   // run segmented dot products for all segments allowed by the vector
-  for(camp::idx_t segbits = 0;(1<<segbits) <= num_elem;++ segbits){
-
-    camp::idx_t num_output_segments = 1<<segbits;
-
-    for(camp::idx_t output_segment = 0;output_segment < num_output_segments;++output_segment){
+  for (camp::idx_t segbits = 0; (1 << segbits) <= num_elem; ++segbits)
+  {
 
+    camp::idx_t num_output_segments = 1 << segbits;
 
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+    for (camp::idx_t output_segment = 0; output_segment < num_output_segments;
+         ++output_segment)
+    {
 
-        register_t x;
-        x.load_packed(input0_dptr);
 
-        register_t y;
-        y.load_packed(input1_dptr);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            register_t x;
+            x.load_packed(input0_dptr);
 
-        register_t dp = x.segmented_dot(segbits, output_segment, y);
-        dp.store_packed(output0_dptr);
+            register_t y;
+            y.load_packed(input1_dptr);
 
-      });
+            register_t dp = x.segmented_dot(segbits, output_segment, y);
+            dp.store_packed(output0_dptr);
+          });
 
 
       // Move result to host
@@ -72,23 +75,25 @@ void SegmentedDotProductImpl()
       // Compute expected values
       std::vector<element_t> expected(num_elem);
 
-      camp::idx_t offset = output_segment * num_elem/(1<<segbits);
+      camp::idx_t offset = output_segment * num_elem / (1 << segbits);
 
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
         expected[i] = 0;
       }
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
-        expected[(i>>segbits) + offset] += input0_vec[i]*input1_vec[i];
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
+        expected[(i >> segbits) + offset] += input0_vec[i] * input1_vec[i];
       }
 
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
         ASSERT_SCALAR_EQ(expected[i], output0_vec[i]);
       }
 
-    } // output_segment
-
-  } // segbits
+    }  // output_segment
 
+  }  // segbits
 
 
   // Cleanup
@@ -98,7 +103,6 @@ void SegmentedDotProductImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorRegister, SegmentedDotProduct)
 {
   SegmentedDotProductImpl<TypeParam>();
diff --git a/test/functional/tensor/register/tests/test-tensor-register-SegmentedSumInner.hpp b/test/functional/tensor/register/tests/test-tensor-register-SegmentedSumInner.hpp
index 2cfda47bcd..e0e45f428c 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-SegmentedSumInner.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-SegmentedSumInner.hpp
@@ -8,54 +8,57 @@
 #ifndef __TEST_TENSOR_REGISTER_SegmentedSumInner_HPP__
 #define __TEST_TENSOR_REGISTER_SegmentedSumInner_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void SegmentedSumInnerImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-    input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
 
 
-
   // run segmented dot products for all segments allowed by the vector
-  for(camp::idx_t segbits = 0;(1<<segbits) <= num_elem;++ segbits){
+  for (camp::idx_t segbits = 0; (1 << segbits) <= num_elem; ++segbits)
+  {
 
-    camp::idx_t num_segments = 1<<segbits;
+    camp::idx_t num_segments = 1 << segbits;
 
-    for(camp::idx_t output_segment = 0;output_segment < num_segments;++ output_segment){
+    for (camp::idx_t output_segment = 0; output_segment < num_segments;
+         ++output_segment)
+    {
 
       // Execute segmented broadcast
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-        register_t x;
-        x.load_packed(input0_dptr);
-
-        register_t y = x.segmented_sum_inner(segbits, output_segment);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            register_t x;
+            x.load_packed(input0_dptr);
 
-        y.store_packed(output0_dptr);
+            register_t y = x.segmented_sum_inner(segbits, output_segment);
 
-      });
+            y.store_packed(output0_dptr);
+          });
 
       // Move result to host
       tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
@@ -65,30 +68,32 @@ void SegmentedSumInnerImpl()
 
       // Compute expected values
       element_t expected[num_elem];
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
         expected[i] = 0;
       }
 
-      camp::idx_t output_offset = output_segment * num_elem>>segbits;
+      camp::idx_t output_offset = output_segment * num_elem >> segbits;
 
       // sum each value into appropriate segment lane
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
 
-        auto off = (i >> segbits)+output_offset;
+        auto off = (i >> segbits) + output_offset;
 
         expected[off] += input0_hptr[i];
       }
 
 
-
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
 
         ASSERT_SCALAR_EQ(expected[i], output0_vec[i]);
       }
 
-    } // segment
+    }  // segment
 
-  } // segbits
+  }  // segbits
 
 
   // Cleanup
@@ -97,7 +102,6 @@ void SegmentedSumInnerImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorRegister, SegmentedSumInner)
 {
   SegmentedSumInnerImpl<TypeParam>();
diff --git a/test/functional/tensor/register/tests/test-tensor-register-SegmentedSumOuter.hpp b/test/functional/tensor/register/tests/test-tensor-register-SegmentedSumOuter.hpp
index 6ce6f2a6e3..484f9e198a 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-SegmentedSumOuter.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-SegmentedSumOuter.hpp
@@ -8,53 +8,56 @@
 #ifndef __TEST_TENSOR_REGISTER_SegmentedSumOuter_HPP__
 #define __TEST_TENSOR_REGISTER_SegmentedSumOuter_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void SegmentedSumOuterImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-    input0_hptr[i] = (element_t)(i+1); //+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1);  //+NO_OPT_RAND);
   }
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
 
 
-
   // run segmented dot products for all segments allowed by the vector
-  for(camp::idx_t segbits = 0;(1<<segbits) <= num_elem;++ segbits){
+  for (camp::idx_t segbits = 0; (1 << segbits) <= num_elem; ++segbits)
+  {
 
-    camp::idx_t num_segments = num_elem>>segbits;
+    camp::idx_t num_segments = num_elem >> segbits;
 
-    for(camp::idx_t output_segment = 0;output_segment < num_segments;++ output_segment){
+    for (camp::idx_t output_segment = 0; output_segment < num_segments;
+         ++output_segment)
+    {
 
       // Execute segmented broadcast
-      tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-        register_t x;
-        x.load_packed(input0_dptr);
+      tensor_do<policy_t>(
+          [=] RAJA_HOST_DEVICE()
+          {
+            register_t x;
+            x.load_packed(input0_dptr);
 
-        register_t y = x.segmented_sum_outer(segbits, output_segment);
+            register_t y = x.segmented_sum_outer(segbits, output_segment);
 
-        y.store_packed(output0_dptr);
-
-      });
+            y.store_packed(output0_dptr);
+          });
 
       // Move result to host
       tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
@@ -64,26 +67,29 @@ void SegmentedSumOuterImpl()
 
       // Compute expected values
       element_t expected[num_elem];
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
         expected[i] = 0;
       }
 
-      camp::idx_t output_offset = output_segment * (1<<segbits);
+      camp::idx_t output_offset = output_segment * (1 << segbits);
 
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
-        camp::idx_t output_i = output_offset + i%(1<<segbits);
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
+        camp::idx_t output_i = output_offset + i % (1 << segbits);
         expected[output_i] += input0_hptr[i];
       }
 
 
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
+      for (camp::idx_t i = 0; i < num_elem; ++i)
+      {
 
         ASSERT_SCALAR_EQ(expected[i], output0_vec[i]);
       }
 
-    } // segment
+    }  // segment
 
-  } // segbits
+  }  // segbits
 
 
   // Cleanup
@@ -92,7 +98,6 @@ void SegmentedSumOuterImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorRegister, SegmentedSumOuter)
 {
   SegmentedSumOuterImpl<TypeParam>();
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Store.hpp b/test/functional/tensor/register/tests/test-tensor-register-Store.hpp
index ac508fb0d6..ca341f74d8 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Store.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Store.hpp
@@ -8,156 +8,171 @@
 #ifndef __TEST_TENSOR_REGISTER_Store_HPP__
 #define __TEST_TENSOR_REGISTER_Store_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void StoreImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
-  std::vector<element_t> output0_vec(10*num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(10*num_elem);
+  std::vector<element_t> output0_vec(10 * num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(10 * num_elem);
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
 
 
   // Initialize output
-  for(camp::idx_t i = 0;i < 10*num_elem; ++ i){
+  for (camp::idx_t i = 0; i < 10 * num_elem; ++i)
+  {
     output0_vec[i] = (element_t)0;
   }
   tensor_copy_to_device<policy_t>(output0_dptr, output0_vec);
 
 
   // store stride-1 to pointer
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill x
-    register_t x;
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      x.set(input0_dptr[i], i);
-    }
-
-    x.store_packed(output0_dptr);
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill x
+        register_t x;
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          x.set(input0_dptr[i], i);
+        }
+
+        x.store_packed(output0_dptr);
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
     ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i]);
   }
 
 
-
-  for(camp::idx_t N = 0;N < num_elem; ++ N){
+  for (camp::idx_t N = 0; N < num_elem; ++N)
+  {
 
     // Initialize output
-    for(camp::idx_t i = 0;i < 10*num_elem; ++ i){
+    for (camp::idx_t i = 0; i < 10 * num_elem; ++i)
+    {
       output0_vec[i] = (element_t)0;
     }
     tensor_copy_to_device<policy_t>(output0_dptr, output0_vec);
 
 
     // load stride-1 from pointer
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-      // fill x
-      register_t x;
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
-        x.set(input0_dptr[i], i);
-      }
-
-      x.store_packed_n(output0_dptr, N);
-
-    });
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          // fill x
+          register_t x;
+          for (camp::idx_t i = 0; i < num_elem; ++i)
+          {
+            x.set(input0_dptr[i], i);
+          }
+
+          x.store_packed_n(output0_dptr, N);
+        });
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
     // check that we were able to copy using set/get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      if(i < N){
+    for (camp::idx_t i = 0; i < num_elem; ++i)
+    {
+      if (i < N)
+      {
         ASSERT_SCALAR_EQ(output0_vec[i], input0_vec[i]);
       }
-      else{
+      else
+      {
         ASSERT_SCALAR_EQ(output0_vec[i], (element_t)0);
       }
     }
   }
 
 
-
   // Initialize output
-  for(camp::idx_t i = 0;i < 10*num_elem; ++ i){
+  for (camp::idx_t i = 0; i < 10 * num_elem; ++i)
+  {
     output0_vec[i] = (element_t)0;
   }
   tensor_copy_to_device<policy_t>(output0_dptr, output0_vec);
 
 
   // load stride-2 from pointer
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    // fill x
-    register_t x;
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      x.set(input0_dptr[i], i);
-    }
-
-    x.store_strided(output0_dptr, 2);
-
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // fill x
+        register_t x;
+        for (camp::idx_t i = 0; i < num_elem; ++i)
+        {
+          x.set(input0_dptr[i], i);
+        }
+
+        x.store_strided(output0_dptr, 2);
+      });
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
   // check that we were able to copy using set/get
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-    ASSERT_SCALAR_EQ(output0_vec[2*i], input0_vec[i]);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    ASSERT_SCALAR_EQ(output0_vec[2 * i], input0_vec[i]);
   }
 
 
-
-  for(camp::idx_t N = 0;N < num_elem; ++ N){
+  for (camp::idx_t N = 0; N < num_elem; ++N)
+  {
 
     // Initialize output
-    for(camp::idx_t i = 0;i < 10*num_elem; ++ i){
+    for (camp::idx_t i = 0; i < 10 * num_elem; ++i)
+    {
       output0_vec[i] = (element_t)0;
     }
     tensor_copy_to_device<policy_t>(output0_dptr, output0_vec);
 
 
-
     // load stride-2 from pointer
-    tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-      // fill x
-      register_t x;
-      for(camp::idx_t i = 0;i < num_elem; ++ i){
-        x.set(input0_dptr[i], i);
-      }
-
-      x.store_strided_n(output0_dptr, 2, N);
-
-    });
+    tensor_do<policy_t>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          // fill x
+          register_t x;
+          for (camp::idx_t i = 0; i < num_elem; ++i)
+          {
+            x.set(input0_dptr[i], i);
+          }
+
+          x.store_strided_n(output0_dptr, 2, N);
+        });
     tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
     // check that we were able to copy using set/get
-    for(camp::idx_t i = 0;i < num_elem; ++ i){
-      if(i < N){
-        ASSERT_SCALAR_EQ(output0_vec[2*i], input0_vec[i]);
+    for (camp::idx_t i = 0; i < num_elem; ++i)
+    {
+      if (i < N)
+      {
+        ASSERT_SCALAR_EQ(output0_vec[2 * i], input0_vec[i]);
       }
-      else{
-        ASSERT_SCALAR_EQ(output0_vec[2*i], (element_t)0);
+      else
+      {
+        ASSERT_SCALAR_EQ(output0_vec[2 * i], (element_t)0);
       }
     }
   }
@@ -171,11 +186,7 @@ void StoreImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Store)
-{
-  StoreImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Store) { StoreImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/register/tests/test-tensor-register-Subtract.hpp b/test/functional/tensor/register/tests/test-tensor-register-Subtract.hpp
index fb9a0efc92..2fdf1425d1 100644
--- a/test/functional/tensor/register/tests/test-tensor-register-Subtract.hpp
+++ b/test/functional/tensor/register/tests/test-tensor-register-Subtract.hpp
@@ -8,35 +8,36 @@
 #ifndef __TEST_TENSOR_REGISTER_Subtract_HPP__
 #define __TEST_TENSOR_REGISTER_Subtract_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename REGISTER_TYPE>
 void SubtractImpl()
 {
   using register_t = REGISTER_TYPE;
-  using element_t = typename register_t::element_type;
-  using policy_t = typename register_t::register_policy;
+  using element_t  = typename register_t::element_type;
+  using policy_t   = typename register_t::register_policy;
 
   static constexpr camp::idx_t num_elem = register_t::s_num_elem;
 
   // Allocate
 
   std::vector<element_t> input0_vec(num_elem);
-  element_t *input0_hptr = input0_vec.data();
-  element_t *input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input0_hptr = input0_vec.data();
+  element_t* input0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> input1_vec(num_elem);
-  element_t *input1_hptr = input1_vec.data();
-  element_t *input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* input1_hptr = input1_vec.data();
+  element_t* input1_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
   std::vector<element_t> output0_vec(num_elem);
-  element_t *output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
+  element_t* output0_dptr = tensor_malloc<policy_t, element_t>(num_elem);
 
 
   // Initialize input data
-  for(camp::idx_t i = 0;i < num_elem; ++ i){
-   input0_hptr[i] = (element_t)(i+1+NO_OPT_RAND);
-   input1_hptr[i] = (element_t)(i*i+1+NO_OPT_RAND);
+  for (camp::idx_t i = 0; i < num_elem; ++i)
+  {
+    input0_hptr[i] = (element_t)(i + 1 + NO_OPT_RAND);
+    input1_hptr[i] = (element_t)(i * i + 1 + NO_OPT_RAND);
   }
 
   tensor_copy_to_device<policy_t>(input0_dptr, input0_vec);
@@ -48,93 +49,95 @@ void SubtractImpl()
   //
 
   // operator -
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
+        register_t z = x - y;
 
-    register_t z = x - y;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] - input1_vec[lane], output0_vec[lane]);
   }
 
 
-
   // operator -=
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t x;
-    x.load_packed(input0_dptr);
+        register_t y;
+        y.load_packed(input1_dptr);
 
-    register_t y;
-    y.load_packed(input1_dptr);
+        register_t z = x;
 
-    register_t z = x;
+        z -= y;
 
-    z -= y;
-
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] - input1_vec[lane], output0_vec[lane]);
   }
 
 
-
-
   // operator - scalar
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    register_t x;
-    x.load_packed(input0_dptr);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t z = x - 7;
+        register_t z = x - 7;
 
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] - 7, output0_vec[lane]);
   }
 
 
-
-
   // operator -= scalar
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-
-    register_t x;
-    x.load_packed(input0_dptr);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        register_t x;
+        x.load_packed(input0_dptr);
 
-    register_t z = x;
+        register_t z = x;
 
-    z -= 3;
+        z -= 3;
 
-    z.store_packed(output0_dptr);
-  });
+        z.store_packed(output0_dptr);
+      });
 
   tensor_copy_to_host<policy_t>(output0_vec, output0_dptr);
 
-  for(camp::idx_t lane = 0;lane < num_elem;++ lane){
+  for (camp::idx_t lane = 0; lane < num_elem; ++lane)
+  {
     ASSERT_SCALAR_EQ(input0_vec[lane] - 3, output0_vec[lane]);
   }
 
 
-
   // Cleanup
   tensor_free<policy_t>(input0_dptr);
   tensor_free<policy_t>(input1_dptr);
@@ -142,11 +145,7 @@ void SubtractImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorRegister, Subtract)
-{
-  SubtractImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorRegister, Subtract) { SubtractImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/vector/tests/test-tensor-vector-CtorGetSet.hpp b/test/functional/tensor/vector/tests/test-tensor-vector-CtorGetSet.hpp
index cbcf7c8783..d6c7e72dd8 100644
--- a/test/functional/tensor/vector/tests/test-tensor-vector-CtorGetSet.hpp
+++ b/test/functional/tensor/vector/tests/test-tensor-vector-CtorGetSet.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_VECTOR_CtorGetSet_HPP__
 #define __TEST_TENSOR_VECTOR_CtorGetSet_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename VECTOR_TYPE>
 void CtorGetSetImpl()
 {
 
-  using vector_t = VECTOR_TYPE;
-  using policy_t = typename vector_t::register_policy;
+  using vector_t  = VECTOR_TYPE;
+  using policy_t  = typename vector_t::register_policy;
   using element_t = typename vector_t::element_type;
 
 
@@ -23,12 +23,13 @@ void CtorGetSetImpl()
   std::vector<element_t> get(vector_t::s_num_elem);
   std::vector<element_t> set(vector_t::s_num_elem);
 
-  element_t * A_ptr = tensor_malloc<policy_t>(A);
-  element_t * get_ptr = tensor_malloc<policy_t>(get);
-  element_t * set_ptr = tensor_malloc<policy_t>(set);
+  element_t* A_ptr   = tensor_malloc<policy_t>(A);
+  element_t* get_ptr = tensor_malloc<policy_t>(get);
+  element_t* set_ptr = tensor_malloc<policy_t>(set);
 
-  for(camp::idx_t i = 0;i < vector_t::s_num_elem;++ i){
-    A[i] = (element_t)(i*2);
+  for (camp::idx_t i = 0; i < vector_t::s_num_elem; ++i)
+  {
+    A[i]   = (element_t)(i * 2);
     get[i] = 0;
     set[i] = 0;
   }
@@ -39,24 +40,29 @@ void CtorGetSetImpl()
 
   // For Fixed vectors, only try with fixed length
   // For Stream vectors, try all lengths
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    for(camp::idx_t N = 1; N <= vector_t::s_num_elem; ++ N){
-      // load array A as vector
-      vector_t vec;
-      vec.load_packed_n(A_ptr, N);
-
-      // try get operations
-      for(camp::idx_t i = 0;i < N;++ i){
-        get_ptr[i] = vec.get(i);
-      }
-
-      // try set and get operations
-      for(camp::idx_t i = 0;i < N;++ i){
-        vec.set((element_t)(i+1), i);
-        set_ptr[i] = vec.get(i);
-      }
-    }
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        for (camp::idx_t N = 1; N <= vector_t::s_num_elem; ++N)
+        {
+          // load array A as vector
+          vector_t vec;
+          vec.load_packed_n(A_ptr, N);
+
+          // try get operations
+          for (camp::idx_t i = 0; i < N; ++i)
+          {
+            get_ptr[i] = vec.get(i);
+          }
+
+          // try set and get operations
+          for (camp::idx_t i = 0; i < N; ++i)
+          {
+            vec.set((element_t)(i + 1), i);
+            set_ptr[i] = vec.get(i);
+          }
+        }
+      });
 
 
   tensor_copy_to_host<policy_t>(get, get_ptr);
@@ -64,17 +70,19 @@ void CtorGetSetImpl()
 
   // For Fixed vectors, only try with fixed length
   // For Stream vectors, try all lengths
-  for(camp::idx_t N = 1; N <= vector_t::s_num_elem; ++ N){
+  for (camp::idx_t N = 1; N <= vector_t::s_num_elem; ++N)
+  {
 
     // check get operations
-    for(camp::idx_t i = 0;i < N;++ i){
-      ASSERT_SCALAR_EQ(get[i], (element_t)(i*2));
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ASSERT_SCALAR_EQ(get[i], (element_t)(i * 2));
     }
 
-    for(camp::idx_t i = 0;i < N;++ i){
-      ASSERT_SCALAR_EQ(set[i], (element_t)(i+1));
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ASSERT_SCALAR_EQ(set[i], (element_t)(i + 1));
     }
-
   }
 
   tensor_free<policy_t>(A_ptr);
@@ -83,11 +91,7 @@ void CtorGetSetImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorVector, CtorGetSet)
-{
-  CtorGetSetImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorVector, CtorGetSet) { CtorGetSetImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/vector/tests/test-tensor-vector-FmaFms.hpp b/test/functional/tensor/vector/tests/test-tensor-vector-FmaFms.hpp
index 61073f5cc3..a2489097d8 100644
--- a/test/functional/tensor/vector/tests/test-tensor-vector-FmaFms.hpp
+++ b/test/functional/tensor/vector/tests/test-tensor-vector-FmaFms.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_VECTOR_FmaFms_HPP__
 #define __TEST_TENSOR_VECTOR_FmaFms_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename VECTOR_TYPE>
 void FmaFmsImpl()
 {
 
-  using vector_t = VECTOR_TYPE;
-  using policy_t = typename vector_t::register_policy;
+  using vector_t  = VECTOR_TYPE;
+  using policy_t  = typename vector_t::register_policy;
   using element_t = typename vector_t::element_type;
 
   std::vector<element_t> A(vector_t::s_num_elem);
@@ -24,16 +24,17 @@ void FmaFmsImpl()
   std::vector<element_t> fma(vector_t::s_num_elem);
   std::vector<element_t> fms(vector_t::s_num_elem);
 
-  element_t * A_ptr = tensor_malloc<policy_t>(A);
-  element_t * B_ptr = tensor_malloc<policy_t>(B);
-  element_t * C_ptr = tensor_malloc<policy_t>(C);
-  element_t * fma_ptr = tensor_malloc<policy_t>(fma);
-  element_t * fms_ptr = tensor_malloc<policy_t>(fms);
-
-  for(camp::idx_t i = 0;i < vector_t::s_num_elem;++ i){
-    A[i] = (element_t)i;
-    B[i] = (element_t)i*2;
-    C[i] = (element_t)i*3;
+  element_t* A_ptr   = tensor_malloc<policy_t>(A);
+  element_t* B_ptr   = tensor_malloc<policy_t>(B);
+  element_t* C_ptr   = tensor_malloc<policy_t>(C);
+  element_t* fma_ptr = tensor_malloc<policy_t>(fma);
+  element_t* fms_ptr = tensor_malloc<policy_t>(fms);
+
+  for (camp::idx_t i = 0; i < vector_t::s_num_elem; ++i)
+  {
+    A[i]   = (element_t)i;
+    B[i]   = (element_t)i * 2;
+    C[i]   = (element_t)i * 3;
     fma[i] = 0;
     fms[i] = 0;
   }
@@ -46,53 +47,60 @@ void FmaFmsImpl()
 
   // For Fixed vectors, only try with fixed length
   // For Stream vectors, try all lengths
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    for(camp::idx_t N = 1; N <= vector_t::s_num_elem; ++ N){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        for (camp::idx_t N = 1; N <= vector_t::s_num_elem; ++N)
+        {
 
-      // load arrays as vectors
-      vector_t vec_A;
-      vec_A.load_packed_n(A_ptr, N);
+          // load arrays as vectors
+          vector_t vec_A;
+          vec_A.load_packed_n(A_ptr, N);
 
-      vector_t vec_B;
-      vec_B.load_packed_n(B_ptr, N);
+          vector_t vec_B;
+          vec_B.load_packed_n(B_ptr, N);
 
-      vector_t vec_C;
-      vec_C.load_packed_n(C_ptr, N);
+          vector_t vec_C;
+          vec_C.load_packed_n(C_ptr, N);
 
 
-      // try FMA (A*B+C)
+          // try FMA (A*B+C)
 
-      vector_t fma = vec_A.multiply_add(vec_B, vec_C);
-      for(camp::idx_t i = 0;i < N;++ i){
-        fma_ptr[i] = fma.get(i);
-      }
+          vector_t fma = vec_A.multiply_add(vec_B, vec_C);
+          for (camp::idx_t i = 0; i < N; ++i)
+          {
+            fma_ptr[i] = fma.get(i);
+          }
 
-      // try FMS (A*B-C)
-      vector_t fms = vec_A.multiply_subtract(vec_B, vec_C);
-      for(camp::idx_t i = 0;i < N;++ i){
-        fms_ptr[i] = fms.get(i);
-      }
-    }
-  });
+          // try FMS (A*B-C)
+          vector_t fms = vec_A.multiply_subtract(vec_B, vec_C);
+          for (camp::idx_t i = 0; i < N; ++i)
+          {
+            fms_ptr[i] = fms.get(i);
+          }
+        }
+      });
 
   tensor_copy_to_host<policy_t>(fma, fma_ptr);
   tensor_copy_to_host<policy_t>(fms, fms_ptr);
 
   // For Fixed vectors, only try with fixed length
   // For Stream vectors, try all lengths
-  for(camp::idx_t N = 1; N <= vector_t::s_num_elem; ++ N){
+  for (camp::idx_t N = 1; N <= vector_t::s_num_elem; ++N)
+  {
 
     // check FMA (A*B+C)
 
-    for(camp::idx_t i = 0;i < N;++ i){
-      ASSERT_SCALAR_EQ(fma[i], A[i]*B[i]+C[i]);
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ASSERT_SCALAR_EQ(fma[i], A[i] * B[i] + C[i]);
     }
 
     // check FMS (A*B-C)
-    for(camp::idx_t i = 0;i < N;++ i){
-      ASSERT_SCALAR_EQ(fms[i], A[i]*B[i]-C[i]);
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ASSERT_SCALAR_EQ(fms[i], A[i] * B[i] - C[i]);
     }
-
   }
 
   tensor_free<policy_t>(A_ptr);
@@ -103,11 +111,7 @@ void FmaFmsImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorVector, FmaFms)
-{
-  FmaFmsImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorVector, FmaFms) { FmaFmsImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef1d.hpp b/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef1d.hpp
index 854dcba8be..2f4269161c 100644
--- a/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef1d.hpp
+++ b/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef1d.hpp
@@ -8,35 +8,36 @@
 #ifndef __TEST_TENSOR_VECTOR_ForallVectorRef1d_HPP__
 #define __TEST_TENSOR_VECTOR_ForallVectorRef1d_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
-RAJA_INDEX_VALUE( TX, "TX" );
+RAJA_INDEX_VALUE(TX, "TX");
 
 template <typename VECTOR_TYPE>
 void ForallVectorRef1dImpl()
 {
 
-  using vector_t = VECTOR_TYPE;
-  using policy_t = typename vector_t::register_policy;
+  using vector_t  = VECTOR_TYPE;
+  using policy_t  = typename vector_t::register_policy;
   using element_t = typename vector_t::element_type;
 
 
-  size_t N = 10*vector_t::s_num_elem+1;
+  size_t N = 10 * vector_t::s_num_elem + 1;
   // If we are not using fixed vectors, add some random number of elements
   // to the array to test some postamble code generation.
-    //N += (size_t)(100*NO_OPT_RAND);
+  // N += (size_t)(100*NO_OPT_RAND);
 
   std::vector<element_t> A(N);
   std::vector<element_t> B(N);
   std::vector<element_t> C(N);
 
-  element_t * A_ptr = tensor_malloc<policy_t>(A);
-  element_t * B_ptr = tensor_malloc<policy_t>(B);
-  element_t * C_ptr = tensor_malloc<policy_t>(C);
+  element_t* A_ptr = tensor_malloc<policy_t>(A);
+  element_t* B_ptr = tensor_malloc<policy_t>(B);
+  element_t* C_ptr = tensor_malloc<policy_t>(C);
 
-  for(size_t i = 0;i < N; ++ i){
-    A[i] = (element_t)(NO_OPT_RAND*1000.0);
-    B[i] = (element_t)(NO_OPT_RAND*1000.0);
+  for (size_t i = 0; i < N; ++i)
+  {
+    A[i] = (element_t)(NO_OPT_RAND * 1000.0);
+    B[i] = (element_t)(NO_OPT_RAND * 1000.0);
     C[i] = 0.0;
   }
 
@@ -57,82 +58,85 @@ void ForallVectorRef1dImpl()
   tensor_copy_to_device<policy_t>(B_ptr, B);
   tensor_copy_to_device<policy_t>(C_ptr, C);
 
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    Z_d[all] = 3 + (X_d[all]*(5/Y_d[all])) + 9;
-  });
+  tensor_do<policy_t>([=] RAJA_HOST_DEVICE()
+                      { Z_d[all] = 3 + (X_d[all] * (5 / Y_d[all])) + 9; });
 
   tensor_copy_to_host<policy_t>(C, C_ptr);
 
-//  for(size_t i = 0;i < N; ++ i){
-//    printf("%lf ", (double)C[i]);
-//  }
-//  printf("\n\n");
+  //  for(size_t i = 0;i < N; ++ i){
+  //    printf("%lf ", (double)C[i]);
+  //  }
+  //  printf("\n\n");
 
-  for(size_t i = 0;i < N;i ++){
-    ASSERT_SCALAR_EQ(element_t(3+(A[i]*(5/B[i]))+9), C[i]);
+  for (size_t i = 0; i < N; i++)
+  {
+    ASSERT_SCALAR_EQ(element_t(3 + (A[i] * (5 / B[i])) + 9), C[i]);
   }
 
 
   // evaluate complex left side division on all() range
-  for(size_t i = 0;i < N; ++ i){
+  for (size_t i = 0; i < N; ++i)
+  {
     C[i] = 0.0;
   }
 
   tensor_copy_to_device<policy_t>(C_ptr, C);
 
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    Z_d[all] = 3 + ((X_d[all]*Y_d[all])/Y_d[all]) + 9;
-  });
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      { Z_d[all] = 3 + ((X_d[all] * Y_d[all]) / Y_d[all]) + 9; });
 
   tensor_copy_to_host<policy_t>(C, C_ptr);
 
-  for(size_t i = 0;i < N;i ++){
-    ASSERT_SCALAR_EQ(element_t(3+((A[i]*B[i])/B[i]))+9, C[i]);
+  for (size_t i = 0; i < N; i++)
+  {
+    ASSERT_SCALAR_EQ(element_t(3 + ((A[i] * B[i]) / B[i])) + 9, C[i]);
   }
 
   // evaluate on a subrange [N/2, N)
-  for(size_t i = 0;i < N; ++ i){
+  for (size_t i = 0; i < N; ++i)
+  {
     C[i] = 0.0;
   }
 
   tensor_copy_to_device<policy_t>(C_ptr, C);
 
   // evaluate on a subrange [N/2, N)
-  auto some = idx_t::range(N/2, N);
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    Z_d[some] = 3.+ (X_d[some]*(5/Y_d[some])) + 9;
-  });
+  auto some = idx_t::range(N / 2, N);
+  tensor_do<policy_t>([=] RAJA_HOST_DEVICE()
+                      { Z_d[some] = 3. + (X_d[some] * (5 / Y_d[some])) + 9; });
 
   tensor_copy_to_host<policy_t>(A, A_ptr);
   tensor_copy_to_host<policy_t>(B, B_ptr);
   tensor_copy_to_host<policy_t>(C, C_ptr);
 
-  for(size_t i = 0;i < N/2;i ++){
+  for (size_t i = 0; i < N / 2; i++)
+  {
     ASSERT_SCALAR_EQ(0, C[i]);
   }
-  for(size_t i = N/2;i < N;i ++){
-    ASSERT_SCALAR_EQ(element_t(3+(A[i]*(5/B[i]))+9), C[i]);
+  for (size_t i = N / 2; i < N; i++)
+  {
+    ASSERT_SCALAR_EQ(element_t(3 + (A[i] * (5 / B[i])) + 9), C[i]);
   }
 
 
-
-
   // evaluate on a subrange [0, N/2) using a forall statement
-  for(size_t i = 0;i < N; ++ i){
+  for (size_t i = 0; i < N; ++i)
+  {
     C[i] = 0.0;
   }
 
   // vector_exec only works on the host due to its use of RAJA::seq_exec
-  RAJA::forall<RAJA::expt::vector_exec<vector_t>>(RAJA::TypedRangeSegment<TX>(0,N/2),
-      [=](TX i){
+  RAJA::forall<RAJA::expt::vector_exec<vector_t>>(
+      RAJA::TypedRangeSegment<TX>(0, N / 2),
+      [=](TX i) { Z[i] = 3 + (X[i] * (5 / Y[i])) + 9; });
 
-     Z[i] = 3 + (X[i]*(5/Y[i])) + 9;
-  });
-
-  for(size_t i = 0;i < N/2;i ++){
-    ASSERT_SCALAR_EQ(element_t(3+(A[i]*(5/B[i]))+9), C[i]);
+  for (size_t i = 0; i < N / 2; i++)
+  {
+    ASSERT_SCALAR_EQ(element_t(3 + (A[i] * (5 / B[i])) + 9), C[i]);
   }
-  for(size_t i = N/2;i < N;i ++){
+  for (size_t i = N / 2; i < N; i++)
+  {
     ASSERT_SCALAR_EQ(0, C[i]);
   }
 
@@ -142,7 +146,6 @@ void ForallVectorRef1dImpl()
 }
 
 
-
 TYPED_TEST_P(TestTensorVector, ForallVectorRef1d)
 {
   ForallVectorRef1dImpl<TypeParam>();
diff --git a/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef2d.hpp b/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef2d.hpp
index 93596d8f23..3b1111b6ef 100644
--- a/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef2d.hpp
+++ b/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef2d.hpp
@@ -8,38 +8,41 @@
 #ifndef __TEST_TENSOR_VECTOR_ForallVectorRef2d_HPP__
 #define __TEST_TENSOR_VECTOR_ForallVectorRef2d_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename VECTOR_TYPE>
-typename std::enable_if<TensorTestHelper<typename VECTOR_TYPE::register_policy>::is_device>::type
+typename std::enable_if<
+    TensorTestHelper<typename VECTOR_TYPE::register_policy>::is_device>::type
 ForallVectorRef2dImpl()
 {
   // do nothing for CUDA or device tests
 }
 
 template <typename VECTOR_TYPE>
-typename std::enable_if<!TensorTestHelper<typename VECTOR_TYPE::register_policy>::is_device>::type
+typename std::enable_if<
+    !TensorTestHelper<typename VECTOR_TYPE::register_policy>::is_device>::type
 ForallVectorRef2dImpl()
 {
-  using vector_t = VECTOR_TYPE;
+  using vector_t  = VECTOR_TYPE;
   using element_t = typename vector_t::element_type;
 
   using index_t = ptrdiff_t;
 
-  index_t N = 3*vector_t::s_num_elem+1;
-  index_t M = 4*vector_t::s_num_elem+1;
+  index_t N = 3 * vector_t::s_num_elem + 1;
+  index_t M = 4 * vector_t::s_num_elem + 1;
   // If we are not using fixed vectors, add some random number of elements
   // to the array to test some postamble code generation.
-  N += (size_t)(10*NO_OPT_RAND);
-  M += (size_t)(10*NO_OPT_RAND);
+  N += (size_t)(10 * NO_OPT_RAND);
+  M += (size_t)(10 * NO_OPT_RAND);
 
-  std::vector<element_t> A(N*M);
-  std::vector<element_t> B(N*M);
-  std::vector<element_t> C(N*M);
+  std::vector<element_t> A(N * M);
+  std::vector<element_t> B(N * M);
+  std::vector<element_t> C(N * M);
 
-  for(index_t i = 0;i < N*M; ++ i){
-    A[i] = (element_t)(NO_OPT_RAND*1000.0);
-    B[i] = (element_t)(NO_OPT_RAND*1000.0);
+  for (index_t i = 0; i < N * M; ++i)
+  {
+    A[i] = (element_t)(NO_OPT_RAND * 1000.0);
+    B[i] = (element_t)(NO_OPT_RAND * 1000.0);
     C[i] = 0.0;
   }
 
@@ -48,32 +51,27 @@ ForallVectorRef2dImpl()
   RAJA::View<element_t, RAJA::Layout<2>> Z(C.data(), N, M);
 
   using idx_t = RAJA::expt::VectorIndex<index_t, vector_t>;
-  auto all = idx_t::all();
+  auto all    = idx_t::all();
 
   //
   // Test with kernel, using sequential policies and ::all()
   //
-  for(index_t i = 0;i < N*M; ++ i){
+  for (index_t i = 0; i < N * M; ++i)
+  {
     C[i] = 0.0;
   }
 
-  using policy1_t =
-      RAJA::KernelPolicy<
-        RAJA::statement::For<0, RAJA::seq_exec,
-            RAJA::statement::Lambda<0>
-        >
-      >;
+  using policy1_t = RAJA::KernelPolicy<
+      RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>;
 
   // Test with kernel, using sequential policies and ::all()
   RAJA::kernel<policy1_t>(
       RAJA::make_tuple(RAJA::TypedRangeSegment<index_t>(0, N)),
-      [=] (index_t i)
-  {
-    Z(i,all) = 3+(X(i,all)*(5/Y(i,all)))+9;
-  });
+      [=](index_t i) { Z(i, all) = 3 + (X(i, all) * (5 / Y(i, all))) + 9; });
 
-  for(index_t i = 0;i < N*M;i ++){
-    ASSERT_SCALAR_EQ(3+(A[i]*(5/B[i]))+9, C[i]);
+  for (index_t i = 0; i < N * M; i++)
+  {
+    ASSERT_SCALAR_EQ(3 + (A[i] * (5 / B[i])) + 9, C[i]);
   }
 
 
@@ -81,74 +79,66 @@ ForallVectorRef2dImpl()
   // Test with kernel, using tensor_exec policy
   //
 
-  for(index_t i = 0;i < N*M; ++ i){
+  for (index_t i = 0; i < N * M; ++i)
+  {
     C[i] = 0.0;
   }
 
-  using policy2_t =
-      RAJA::KernelPolicy<
-        RAJA::statement::For<0, RAJA::seq_exec,
-          RAJA::statement::For<1, RAJA::expt::vector_exec<vector_t>,
-            RAJA::statement::Lambda<0>
-          >
-        >
-      >;
+  using policy2_t = RAJA::KernelPolicy<RAJA::statement::For<
+      0, RAJA::seq_exec,
+      RAJA::statement::For<1, RAJA::expt::vector_exec<vector_t>,
+                           RAJA::statement::Lambda<0>>>>;
 
   RAJA::kernel<policy2_t>(
       RAJA::make_tuple(RAJA::TypedRangeSegment<index_t>(0, N),
                        RAJA::TypedRangeSegment<index_t>(0, M)),
 
       [=](index_t i, index_t j)
-  {
-    Z(i, j) = 3+(X(i, j)*(5/Y(i, j)))+9;
-  });
+      { Z(i, j) = 3 + (X(i, j) * (5 / Y(i, j))) + 9; });
 
-  for(index_t i = 0;i < N*M;i ++){
-    ASSERT_SCALAR_EQ(3+(A[i]*(5/B[i]))+9, C[i]);
+  for (index_t i = 0; i < N * M; i++)
+  {
+    ASSERT_SCALAR_EQ(3 + (A[i] * (5 / B[i])) + 9, C[i]);
   }
 
 
-
   //
   // Test with forall with vectors in i
   //
-  for(index_t i = 0;i < N*M; ++ i){
+  for (index_t i = 0; i < N * M; ++i)
+  {
     C[i] = 0.0;
   }
 
-  RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<index_t>(0, M),
-      [=](index_t j){
+  RAJA::forall<RAJA::seq_exec>(
+      RAJA::TypedRangeSegment<index_t>(0, M),
+      [=](index_t j) { Z(all, j) = 3 + (X(all, j) * (5 / Y(all, j))) + 9; });
 
-    Z(all,j) = 3+(X(all,j)*(5/Y(all,j)))+9;
-
-  });
-
-  for(index_t i = 0;i < N*M;i ++){
-    ASSERT_SCALAR_EQ(3+(A[i]*(5/B[i]))+9, C[i]);
+  for (index_t i = 0; i < N * M; i++)
+  {
+    ASSERT_SCALAR_EQ(3 + (A[i] * (5 / B[i])) + 9, C[i]);
   }
 
 
   //
   // Test with forall with vectors in j
   //
-  for(index_t i = 0;i < N*M; ++ i){
+  for (index_t i = 0; i < N * M; ++i)
+  {
     C[i] = 0.0;
   }
 
-  RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<index_t>(0, N),
-      [=](index_t i){
+  RAJA::forall<RAJA::seq_exec>(
+      RAJA::TypedRangeSegment<index_t>(0, N),
+      [=](index_t i) { Z(i, all) = 3 + (X(i, all) * (5 / Y(i, all))) + 9; });
 
-    Z(i,all) = 3+(X(i,all)*(5/Y(i,all)))+9;
-
-  });
-
-  for(index_t i = 0;i < N*M;i ++){
-    ASSERT_SCALAR_EQ(3+(A[i]*(5/B[i]))+9, C[i]);
+  for (index_t i = 0; i < N * M; i++)
+  {
+    ASSERT_SCALAR_EQ(3 + (A[i] * (5 / B[i])) + 9, C[i]);
   }
 }
 
 
-
 TYPED_TEST_P(TestTensorVector, ForallVectorRef2d)
 {
   ForallVectorRef2dImpl<TypeParam>();
diff --git a/test/functional/tensor/vector/tests/test-tensor-vector-MinMax.hpp b/test/functional/tensor/vector/tests/test-tensor-vector-MinMax.hpp
index 4841c4e7ee..42b13bb70c 100644
--- a/test/functional/tensor/vector/tests/test-tensor-vector-MinMax.hpp
+++ b/test/functional/tensor/vector/tests/test-tensor-vector-MinMax.hpp
@@ -8,25 +8,26 @@
 #ifndef __TEST_TENSOR_VECTOR_MinMax_HPP__
 #define __TEST_TENSOR_VECTOR_MinMax_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename VECTOR_TYPE>
 void MinMaxImpl()
 {
 
-  using vector_t = VECTOR_TYPE;
-  using policy_t = typename vector_t::register_policy;
+  using vector_t  = VECTOR_TYPE;
+  using policy_t  = typename vector_t::register_policy;
   using element_t = typename vector_t::element_type;
 
   std::vector<element_t> A(vector_t::s_num_elem);
   std::vector<element_t> ex_min(1);
   std::vector<element_t> ex_max(1);
 
-  element_t * A_ptr = tensor_malloc<policy_t>(A);
-  element_t * ex_min_ptr = tensor_malloc<policy_t>(ex_min);
-  element_t * ex_max_ptr = tensor_malloc<policy_t>(ex_max);
+  element_t* A_ptr      = tensor_malloc<policy_t>(A);
+  element_t* ex_min_ptr = tensor_malloc<policy_t>(ex_min);
+  element_t* ex_max_ptr = tensor_malloc<policy_t>(ex_max);
 
-  for(camp::idx_t i = 0;i < vector_t::s_num_elem;++ i){
+  for (camp::idx_t i = 0; i < vector_t::s_num_elem; ++i)
+  {
     A[i] = (element_t)i;
   }
   ex_min[0] = (element_t)99999999;
@@ -39,17 +40,20 @@ void MinMaxImpl()
 
   // For Fixed vectors, only try with fixed length
   // For Stream vectors, try all lengths
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    for(camp::idx_t N = 1; N <= vector_t::s_num_elem; ++ N){
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        for (camp::idx_t N = 1; N <= vector_t::s_num_elem; ++N)
+        {
 
-      // load array A as vector
-      vector_t vec;
-      vec.load_packed_n(A_ptr, N);
+          // load array A as vector
+          vector_t vec;
+          vec.load_packed_n(A_ptr, N);
 
-      ex_min_ptr[0] = vec.min_n(N);
-      ex_max_ptr[0] = vec.max_n(N);
-    }
-  });
+          ex_min_ptr[0] = vec.min_n(N);
+          ex_max_ptr[0] = vec.max_n(N);
+        }
+      });
 
   tensor_copy_to_host<policy_t>(ex_min, ex_min_ptr);
   tensor_copy_to_host<policy_t>(ex_max, ex_max_ptr);
@@ -58,7 +62,7 @@ void MinMaxImpl()
   ASSERT_SCALAR_EQ(ex_min[0], (element_t)0);
 
   // check max
-  ASSERT_SCALAR_EQ(ex_max[0], (element_t)(vector_t::s_num_elem-1));
+  ASSERT_SCALAR_EQ(ex_max[0], (element_t)(vector_t::s_num_elem - 1));
 
   tensor_free<policy_t>(A_ptr);
   tensor_free<policy_t>(ex_min_ptr);
@@ -66,11 +70,7 @@ void MinMaxImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorVector, MinMax)
-{
-  MinMaxImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorVector, MinMax) { MinMaxImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/tensor/vector/tests/test-tensor-vector-SumDot.hpp b/test/functional/tensor/vector/tests/test-tensor-vector-SumDot.hpp
index fa3a1caef8..5138d7858a 100644
--- a/test/functional/tensor/vector/tests/test-tensor-vector-SumDot.hpp
+++ b/test/functional/tensor/vector/tests/test-tensor-vector-SumDot.hpp
@@ -8,14 +8,14 @@
 #ifndef __TEST_TENSOR_VECTOR_SumDot_HPP__
 #define __TEST_TENSOR_VECTOR_SumDot_HPP__
 
-#include<RAJA/RAJA.hpp>
+#include <RAJA/RAJA.hpp>
 
 template <typename VECTOR_TYPE>
 void SumDotImpl()
 {
 
-  using vector_t = VECTOR_TYPE;
-  using policy_t = typename vector_t::register_policy;
+  using vector_t  = VECTOR_TYPE;
+  using policy_t  = typename vector_t::register_policy;
   using element_t = typename vector_t::element_type;
 
   std::vector<element_t> A(vector_t::s_num_elem);
@@ -25,11 +25,12 @@ void SumDotImpl()
   element_t host_sum = 0;
   element_t host_dot = 0;
 
-  element_t * A_ptr = tensor_malloc<policy_t>(A);
-  element_t * ex_sum_ptr = tensor_malloc<policy_t>(ex_sum);
-  element_t * ex_dot_ptr = tensor_malloc<policy_t>(ex_dot);
+  element_t* A_ptr      = tensor_malloc<policy_t>(A);
+  element_t* ex_sum_ptr = tensor_malloc<policy_t>(ex_sum);
+  element_t* ex_dot_ptr = tensor_malloc<policy_t>(ex_dot);
 
-  for(camp::idx_t i = 0;i < vector_t::s_num_elem;++ i){
+  for (camp::idx_t i = 0; i < vector_t::s_num_elem; ++i)
+  {
     A[i] = (element_t)i;
   }
 
@@ -37,9 +38,10 @@ void SumDotImpl()
   ex_dot[0] = (element_t)0;
 
   // compute expected values on host
-  for(camp::idx_t i = 0; i < vector_t::s_num_elem; ++i){
+  for (camp::idx_t i = 0; i < vector_t::s_num_elem; ++i)
+  {
     host_sum += A[i];
-    host_dot += A[i]*A[i];
+    host_dot += A[i] * A[i];
   }
 
   tensor_copy_to_device<policy_t>(A_ptr, A);
@@ -48,14 +50,16 @@ void SumDotImpl()
 
   // For Fixed vectors, only try with fixed length
   // For Stream vectors, try all lengths
-  tensor_do<policy_t>([=] RAJA_HOST_DEVICE (){
-    // load array A as vector
-    vector_t vec;
-    vec.load_packed_n(A_ptr, vector_t::s_num_elem);
+  tensor_do<policy_t>(
+      [=] RAJA_HOST_DEVICE()
+      {
+        // load array A as vector
+        vector_t vec;
+        vec.load_packed_n(A_ptr, vector_t::s_num_elem);
 
-    ex_sum_ptr[0] = vec.sum();
-    ex_dot_ptr[0] = vec.dot(vec);
-  });
+        ex_sum_ptr[0] = vec.sum();
+        ex_dot_ptr[0] = vec.dot(vec);
+      });
 
   tensor_copy_to_host<policy_t>(ex_sum, ex_sum_ptr);
   tensor_copy_to_host<policy_t>(ex_dot, ex_dot_ptr);
@@ -72,11 +76,7 @@ void SumDotImpl()
 }
 
 
-
-TYPED_TEST_P(TestTensorVector, SumDot)
-{
-  SumDotImpl<TypeParam>();
-}
+TYPED_TEST_P(TestTensorVector, SumDot) { SumDotImpl<TypeParam>(); }
 
 
 #endif
diff --git a/test/functional/util/test-CombiningAdapter-1D.cpp b/test/functional/util/test-CombiningAdapter-1D.cpp
index 4dc73781d5..2867cd56e2 100644
--- a/test/functional/util/test-CombiningAdapter-1D.cpp
+++ b/test/functional/util/test-CombiningAdapter-1D.cpp
@@ -19,17 +19,22 @@
 #include <numeric>
 #include <vector>
 
-template < typename SegIndexType, typename Segment0 >
+template <typename SegIndexType, typename Segment0>
 void test_CombiningAdapter_1D(Segment0 const& seg0)
 {
-  using std::begin; using std::end; using std::distance;
+  using std::begin;
+  using std::distance;
+  using std::end;
   auto seg0_begin = begin(seg0);
 
   size_t counter0 = 0;
-  auto adapter = RAJA::make_CombiningAdapter([&](SegIndexType i0) {
-    ASSERT_EQ(seg0_begin[counter0], i0);
-    counter0 += 1;
-  }, seg0);
+  auto adapter    = RAJA::make_CombiningAdapter(
+      [&](SegIndexType i0)
+      {
+        ASSERT_EQ(seg0_begin[counter0], i0);
+        counter0 += 1;
+      },
+      seg0);
 
   ASSERT_EQ(adapter.size(), seg0.size());
 
@@ -38,12 +43,13 @@ void test_CombiningAdapter_1D(Segment0 const& seg0)
   ASSERT_EQ(distance(begin(range), end(range)), seg0.size());
 
   auto range_end = end(range);
-  for (auto idx = begin(range); idx != range_end; ++idx) {
+  for (auto idx = begin(range); idx != range_end; ++idx)
+  {
     adapter(*idx);
   }
 }
 
-template < typename SegIndexType >
+template <typename SegIndexType>
 void test_types_CombiningAdapter_1D(SegIndexType ibegin0, SegIndexType iend0)
 {
   RAJA::TypedRangeSegment<SegIndexType> rseg0(ibegin0, iend0);
diff --git a/test/functional/util/test-CombiningAdapter-2D.cpp b/test/functional/util/test-CombiningAdapter-2D.cpp
index bfb7355418..6a01da4836 100644
--- a/test/functional/util/test-CombiningAdapter-2D.cpp
+++ b/test/functional/util/test-CombiningAdapter-2D.cpp
@@ -19,42 +19,53 @@
 #include <numeric>
 #include <vector>
 
-template < typename SegIndexType0, typename SegIndexType1,
-           typename Segment0, typename Segment1 >
+template <typename SegIndexType0,
+          typename SegIndexType1,
+          typename Segment0,
+          typename Segment1>
 void test_CombiningAdapter_2D(Segment0 const& seg0, Segment1 const& seg1)
 {
-  using std::begin; using std::end; using std::distance;
+  using std::begin;
+  using std::distance;
+  using std::end;
   auto seg0_begin = begin(seg0);
   auto seg1_begin = begin(seg1);
   size_t seg1_len = static_cast<size_t>(seg1.size());
 
   size_t counter0 = 0;
   size_t counter1 = 0;
-  auto adapter = RAJA::make_CombiningAdapter([&](SegIndexType0 i0, SegIndexType1 i1) {
-    ASSERT_EQ(seg0_begin[counter0], i0);
-    ASSERT_EQ(seg1_begin[counter1], i1);
-    counter1 += 1;
-    if (counter1 == seg1_len) {
-      counter1 = 0;
-      counter0 += 1;
-    }
-  }, seg0, seg1);
+  auto adapter    = RAJA::make_CombiningAdapter(
+      [&](SegIndexType0 i0, SegIndexType1 i1)
+      {
+        ASSERT_EQ(seg0_begin[counter0], i0);
+        ASSERT_EQ(seg1_begin[counter1], i1);
+        counter1 += 1;
+        if (counter1 == seg1_len)
+        {
+          counter1 = 0;
+          counter0 += 1;
+        }
+      },
+      seg0, seg1);
 
-  ASSERT_EQ(adapter.size(), seg0.size()*seg1.size());
+  ASSERT_EQ(adapter.size(), seg0.size() * seg1.size());
 
   auto range = adapter.getRange();
 
-  ASSERT_EQ(distance(begin(range), end(range)), seg0.size()*seg1.size());
+  ASSERT_EQ(distance(begin(range), end(range)), seg0.size() * seg1.size());
 
   auto range_end = end(range);
-  for (auto idx = begin(range); idx != range_end; ++idx) {
+  for (auto idx = begin(range); idx != range_end; ++idx)
+  {
     adapter(*idx);
   }
 }
 
-template < typename SegIndexType0, typename SegIndexType1 >
-void test_types_CombiningAdapter_2D(SegIndexType0 ibegin0, SegIndexType0 iend0,
-                                    SegIndexType1 ibegin1, SegIndexType1 iend1)
+template <typename SegIndexType0, typename SegIndexType1>
+void test_types_CombiningAdapter_2D(SegIndexType0 ibegin0,
+                                    SegIndexType0 iend0,
+                                    SegIndexType1 ibegin1,
+                                    SegIndexType1 iend1)
 {
   RAJA::TypedRangeSegment<SegIndexType0> rseg0(ibegin0, iend0);
   RAJA::TypedRangeSegment<SegIndexType1> rseg1(ibegin1, iend1);
diff --git a/test/functional/util/test-CombiningAdapter-3D.cpp b/test/functional/util/test-CombiningAdapter-3D.cpp
index 9181b974b9..38226bdf4a 100644
--- a/test/functional/util/test-CombiningAdapter-3D.cpp
+++ b/test/functional/util/test-CombiningAdapter-3D.cpp
@@ -19,11 +19,19 @@
 #include <numeric>
 #include <vector>
 
-template < typename SegIndexType0, typename SegIndexType1, typename SegIndexType2,
-           typename Segment0, typename Segment1, typename Segment2 >
-void test_CombiningAdapter_3D(Segment0 const& seg0, Segment1 const& seg1, Segment2 const& seg2)
+template <typename SegIndexType0,
+          typename SegIndexType1,
+          typename SegIndexType2,
+          typename Segment0,
+          typename Segment1,
+          typename Segment2>
+void test_CombiningAdapter_3D(Segment0 const& seg0,
+                              Segment1 const& seg1,
+                              Segment2 const& seg2)
 {
-  using std::begin; using std::end; using std::distance;
+  using std::begin;
+  using std::distance;
+  using std::end;
   auto seg0_begin = begin(seg0);
   auto seg1_begin = begin(seg1);
   size_t seg1_len = static_cast<size_t>(seg1.size());
@@ -33,42 +41,55 @@ void test_CombiningAdapter_3D(Segment0 const& seg0, Segment1 const& seg1, Segmen
   size_t counter0 = 0;
   size_t counter1 = 0;
   size_t counter2 = 0;
-  auto adapter = RAJA::make_CombiningAdapter([&](SegIndexType0 i0, SegIndexType1 i1, SegIndexType2 i2) {
-    ASSERT_EQ(seg0_begin[counter0], i0);
-    ASSERT_EQ(seg1_begin[counter1], i1);
-    ASSERT_EQ(seg2_begin[counter2], i2);
-    counter2 += 1;
-    if (counter2 == seg2_len) {
-      counter2 = 0;
-      counter1 += 1;
-      if (counter1 == seg1_len) {
-        counter1 = 0;
-        counter0 += 1;
-      }
-    }
-  }, seg0, seg1, seg2);
+  auto adapter    = RAJA::make_CombiningAdapter(
+      [&](SegIndexType0 i0, SegIndexType1 i1, SegIndexType2 i2)
+      {
+        ASSERT_EQ(seg0_begin[counter0], i0);
+        ASSERT_EQ(seg1_begin[counter1], i1);
+        ASSERT_EQ(seg2_begin[counter2], i2);
+        counter2 += 1;
+        if (counter2 == seg2_len)
+        {
+          counter2 = 0;
+          counter1 += 1;
+          if (counter1 == seg1_len)
+          {
+            counter1 = 0;
+            counter0 += 1;
+          }
+        }
+      },
+      seg0, seg1, seg2);
 
-  ASSERT_EQ(adapter.size(), seg0.size()*seg1.size()*seg2.size());
+  ASSERT_EQ(adapter.size(), seg0.size() * seg1.size() * seg2.size());
 
   auto range = adapter.getRange();
 
-  ASSERT_EQ(distance(begin(range), end(range)), seg0.size()*seg1.size()*seg2.size());
+  ASSERT_EQ(distance(begin(range), end(range)),
+            seg0.size() * seg1.size() * seg2.size());
 
   auto range_end = end(range);
-  for (auto idx = begin(range); idx != range_end; ++idx) {
+  for (auto idx = begin(range); idx != range_end; ++idx)
+  {
     adapter(*idx);
   }
 }
 
-template < typename SegIndexType0, typename SegIndexType1, typename SegIndexType2 >
-void test_types_CombiningAdapter_3D(SegIndexType0 ibegin0, SegIndexType0 iend0,
-                                    SegIndexType1 ibegin1, SegIndexType1 iend1,
-                                    SegIndexType2 ibegin2, SegIndexType2 iend2)
+template <typename SegIndexType0,
+          typename SegIndexType1,
+          typename SegIndexType2>
+void test_types_CombiningAdapter_3D(SegIndexType0 ibegin0,
+                                    SegIndexType0 iend0,
+                                    SegIndexType1 ibegin1,
+                                    SegIndexType1 iend1,
+                                    SegIndexType2 ibegin2,
+                                    SegIndexType2 iend2)
 {
   RAJA::TypedRangeSegment<SegIndexType0> rseg0(ibegin0, iend0);
   RAJA::TypedRangeSegment<SegIndexType1> rseg1(ibegin1, iend1);
   RAJA::TypedRangeSegment<SegIndexType2> rseg2(ibegin2, iend2);
-  test_CombiningAdapter_3D<SegIndexType0, SegIndexType1, SegIndexType2>(rseg0, rseg1, rseg2);
+  test_CombiningAdapter_3D<SegIndexType0, SegIndexType1, SegIndexType2>(
+      rseg0, rseg1, rseg2);
 }
 
 TEST(CombiningAdapter, test3D)
diff --git a/test/functional/util/test-PermutedCombiningAdapter-1D.cpp b/test/functional/util/test-PermutedCombiningAdapter-1D.cpp
index ddcaea52d7..f7d489c75e 100644
--- a/test/functional/util/test-PermutedCombiningAdapter-1D.cpp
+++ b/test/functional/util/test-PermutedCombiningAdapter-1D.cpp
@@ -19,17 +19,22 @@
 #include <numeric>
 #include <vector>
 
-template < typename Perm, typename IndexType, typename Segment >
+template <typename Perm, typename IndexType, typename Segment>
 void test_PermutedCombiningAdapter_1D(Segment const& seg0)
 {
-  using std::begin; using std::end; using std::distance;
+  using std::begin;
+  using std::distance;
+  using std::end;
   auto seg0_begin = begin(seg0);
 
   size_t counters[1] = {0};
-  auto adapter = RAJA::make_PermutedCombiningAdapter<Perm>([&](IndexType i0) {
-    ASSERT_EQ(seg0_begin[counters[0]], i0);
-    counters[camp::seq_at<0, Perm>::value] += 1;
-  }, seg0);
+  auto adapter       = RAJA::make_PermutedCombiningAdapter<Perm>(
+      [&](IndexType i0)
+      {
+        ASSERT_EQ(seg0_begin[counters[0]], i0);
+        counters[camp::seq_at<0, Perm>::value] += 1;
+      },
+      seg0);
 
   ASSERT_EQ(adapter.size(), seg0.size());
 
@@ -38,12 +43,13 @@ void test_PermutedCombiningAdapter_1D(Segment const& seg0)
   ASSERT_EQ(distance(begin(range), end(range)), seg0.size());
 
   auto range_end = end(range);
-  for (auto idx = begin(range); idx != range_end; ++idx) {
+  for (auto idx = begin(range); idx != range_end; ++idx)
+  {
     adapter(*idx);
   }
 }
 
-template < typename Perm, typename IndexType >
+template <typename Perm, typename IndexType>
 void test_types_PermutedCombiningAdapter_1D(IndexType ibegin0, IndexType iend0)
 {
   RAJA::TypedRangeSegment<IndexType> rseg0(ibegin0, iend0);
diff --git a/test/functional/util/test-PermutedCombiningAdapter-2D.cpp b/test/functional/util/test-PermutedCombiningAdapter-2D.cpp
index fd1f6a8b0a..d9396d6ebd 100644
--- a/test/functional/util/test-PermutedCombiningAdapter-2D.cpp
+++ b/test/functional/util/test-PermutedCombiningAdapter-2D.cpp
@@ -19,41 +19,51 @@
 #include <numeric>
 #include <vector>
 
-template < typename Perm, typename IndexType, typename Segment >
+template <typename Perm, typename IndexType, typename Segment>
 void test_PermutedCombiningAdapter_2D(Segment const& seg0, Segment const& seg1)
 {
-  using std::begin; using std::end; using std::distance;
-  auto seg0_begin = begin(seg0);
-  auto seg1_begin = begin(seg1);
+  using std::begin;
+  using std::distance;
+  using std::end;
+  auto seg0_begin    = begin(seg0);
+  auto seg1_begin    = begin(seg1);
   size_t seg_lens[2] = {static_cast<size_t>(seg0.size()),
                         static_cast<size_t>(seg1.size())};
 
   size_t counters[2] = {0, 0};
-  auto adapter = RAJA::make_PermutedCombiningAdapter<Perm>([&](IndexType i0, IndexType i1) {
-    ASSERT_EQ(seg0_begin[counters[0]], i0);
-    ASSERT_EQ(seg1_begin[counters[1]], i1);
-    counters[camp::seq_at<1, Perm>::value] += 1;
-    if (counters[camp::seq_at<1, Perm>::value] == seg_lens[camp::seq_at<1, Perm>::value]) {
-      counters[camp::seq_at<1, Perm>::value] = 0;
-      counters[camp::seq_at<0, Perm>::value] += 1;
-    }
-  }, seg0, seg1);
+  auto adapter       = RAJA::make_PermutedCombiningAdapter<Perm>(
+      [&](IndexType i0, IndexType i1)
+      {
+        ASSERT_EQ(seg0_begin[counters[0]], i0);
+        ASSERT_EQ(seg1_begin[counters[1]], i1);
+        counters[camp::seq_at<1, Perm>::value] += 1;
+        if (counters[camp::seq_at<1, Perm>::value] ==
+            seg_lens[camp::seq_at<1, Perm>::value])
+        {
+          counters[camp::seq_at<1, Perm>::value] = 0;
+          counters[camp::seq_at<0, Perm>::value] += 1;
+        }
+      },
+      seg0, seg1);
 
-  ASSERT_EQ(adapter.size(), seg0.size()*seg1.size());
+  ASSERT_EQ(adapter.size(), seg0.size() * seg1.size());
 
   auto range = adapter.getRange();
 
-  ASSERT_EQ(distance(begin(range), end(range)), seg0.size()*seg1.size());
+  ASSERT_EQ(distance(begin(range), end(range)), seg0.size() * seg1.size());
 
   auto range_end = end(range);
-  for (auto idx = begin(range); idx != range_end; ++idx) {
+  for (auto idx = begin(range); idx != range_end; ++idx)
+  {
     adapter(*idx);
   }
 }
 
-template < typename Perm, typename IndexType >
-void test_types_PermutedCombiningAdapter_2D(IndexType ibegin0, IndexType iend0,
-                                            IndexType ibegin1, IndexType iend1)
+template <typename Perm, typename IndexType>
+void test_types_PermutedCombiningAdapter_2D(IndexType ibegin0,
+                                            IndexType iend0,
+                                            IndexType ibegin1,
+                                            IndexType iend1)
 {
   RAJA::TypedRangeSegment<IndexType> rseg0(ibegin0, iend0);
   RAJA::TypedRangeSegment<IndexType> rseg1(ibegin1, iend1);
diff --git a/test/functional/util/test-PermutedCombiningAdapter-3D.cpp b/test/functional/util/test-PermutedCombiningAdapter-3D.cpp
index 0943584c97..2ef1021251 100644
--- a/test/functional/util/test-PermutedCombiningAdapter-3D.cpp
+++ b/test/functional/util/test-PermutedCombiningAdapter-3D.cpp
@@ -19,49 +19,65 @@
 #include <numeric>
 #include <vector>
 
-template < typename Perm, typename IndexType, typename Segment >
-void test_PermutedCombiningAdapter_3D(Segment const& seg0, Segment const& seg1, Segment const& seg2)
+template <typename Perm, typename IndexType, typename Segment>
+void test_PermutedCombiningAdapter_3D(Segment const& seg0,
+                                      Segment const& seg1,
+                                      Segment const& seg2)
 {
-  using std::begin; using std::end; using std::distance;
-  auto seg0_begin = begin(seg0);
-  auto seg1_begin = begin(seg1);
-  auto seg2_begin = begin(seg2);
+  using std::begin;
+  using std::distance;
+  using std::end;
+  auto seg0_begin    = begin(seg0);
+  auto seg1_begin    = begin(seg1);
+  auto seg2_begin    = begin(seg2);
   size_t seg_lens[3] = {static_cast<size_t>(seg0.size()),
                         static_cast<size_t>(seg1.size()),
                         static_cast<size_t>(seg2.size())};
 
   size_t counters[3] = {0, 0, 0};
-  auto adapter = RAJA::make_PermutedCombiningAdapter<Perm>([&](IndexType i0, IndexType i1, IndexType i2) {
-    ASSERT_EQ(seg0_begin[counters[0]], i0);
-    ASSERT_EQ(seg1_begin[counters[1]], i1);
-    ASSERT_EQ(seg2_begin[counters[2]], i2);
-    counters[camp::seq_at<2, Perm>::value] += 1;
-    if (counters[camp::seq_at<2, Perm>::value] == seg_lens[camp::seq_at<2, Perm>::value]) {
-      counters[camp::seq_at<2, Perm>::value] = 0;
-      counters[camp::seq_at<1, Perm>::value] += 1;
-      if (counters[camp::seq_at<1, Perm>::value] == seg_lens[camp::seq_at<1, Perm>::value]) {
-        counters[camp::seq_at<1, Perm>::value] = 0;
-        counters[camp::seq_at<0, Perm>::value] += 1;
-      }
-    }
-  }, seg0, seg1, seg2);
+  auto adapter       = RAJA::make_PermutedCombiningAdapter<Perm>(
+      [&](IndexType i0, IndexType i1, IndexType i2)
+      {
+        ASSERT_EQ(seg0_begin[counters[0]], i0);
+        ASSERT_EQ(seg1_begin[counters[1]], i1);
+        ASSERT_EQ(seg2_begin[counters[2]], i2);
+        counters[camp::seq_at<2, Perm>::value] += 1;
+        if (counters[camp::seq_at<2, Perm>::value] ==
+            seg_lens[camp::seq_at<2, Perm>::value])
+        {
+          counters[camp::seq_at<2, Perm>::value] = 0;
+          counters[camp::seq_at<1, Perm>::value] += 1;
+          if (counters[camp::seq_at<1, Perm>::value] ==
+              seg_lens[camp::seq_at<1, Perm>::value])
+          {
+            counters[camp::seq_at<1, Perm>::value] = 0;
+            counters[camp::seq_at<0, Perm>::value] += 1;
+          }
+        }
+      },
+      seg0, seg1, seg2);
 
-  ASSERT_EQ(adapter.size(), seg0.size()*seg1.size()*seg2.size());
+  ASSERT_EQ(adapter.size(), seg0.size() * seg1.size() * seg2.size());
 
   auto range = adapter.getRange();
 
-  ASSERT_EQ(distance(begin(range), end(range)), seg0.size()*seg1.size()*seg2.size());
+  ASSERT_EQ(distance(begin(range), end(range)),
+            seg0.size() * seg1.size() * seg2.size());
 
   auto range_end = end(range);
-  for (auto idx = begin(range); idx != range_end; ++idx) {
+  for (auto idx = begin(range); idx != range_end; ++idx)
+  {
     adapter(*idx);
   }
 }
 
-template < typename Perm, typename IndexType >
-void test_types_PermutedCombiningAdapter_3D(IndexType ibegin0, IndexType iend0,
-                                            IndexType ibegin1, IndexType iend1,
-                                            IndexType ibegin2, IndexType iend2)
+template <typename Perm, typename IndexType>
+void test_types_PermutedCombiningAdapter_3D(IndexType ibegin0,
+                                            IndexType iend0,
+                                            IndexType ibegin1,
+                                            IndexType iend1,
+                                            IndexType ibegin2,
+                                            IndexType iend2)
 {
   RAJA::TypedRangeSegment<IndexType> rseg0(ibegin0, iend0);
   RAJA::TypedRangeSegment<IndexType> rseg1(ibegin1, iend1);
@@ -77,7 +93,10 @@ TEST(PermutedCombiningAdapter, test3D)
   test_types_PermutedCombiningAdapter_3D<RAJA::PERM_JKI, int>(0, 0, 0, 0, 0, 5);
 
   test_types_PermutedCombiningAdapter_3D<RAJA::PERM_KIJ, int>(0, 3, 0, 4, 0, 5);
-  test_types_PermutedCombiningAdapter_3D<RAJA::PERM_KJI, long>(-3, 5, 0, 6, 2, 5);
-  test_types_PermutedCombiningAdapter_3D<RAJA::PERM_IJK, long>(4, 13, -2, 7, -3, 0);
-  test_types_PermutedCombiningAdapter_3D<RAJA::PERM_IKJ, long>(-8, -2, -5, 3, 1, 4);
+  test_types_PermutedCombiningAdapter_3D<RAJA::PERM_KJI, long>(-3, 5, 0, 6, 2,
+                                                               5);
+  test_types_PermutedCombiningAdapter_3D<RAJA::PERM_IJK, long>(4, 13, -2, 7, -3,
+                                                               0);
+  test_types_PermutedCombiningAdapter_3D<RAJA::PERM_IKJ, long>(-8, -2, -5, 3, 1,
+                                                               4);
 }
diff --git a/test/functional/workgroup/tests/test-workgroup-Ordered-MultipleReuse.hpp b/test/functional/workgroup/tests/test-workgroup-Ordered-MultipleReuse.hpp
index 4241a945dd..24dc62646b 100644
--- a/test/functional/workgroup/tests/test-workgroup-Ordered-MultipleReuse.hpp
+++ b/test/functional/workgroup/tests/test-workgroup-Ordered-MultipleReuse.hpp
@@ -20,50 +20,62 @@
 
 
 // These are defined here due to cuda limitations
-template < typename IndexType, typename type1 >
-struct callable11 {
+template <typename IndexType, typename type1>
+struct callable11
+{
   type1* working_ptr1;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
     working_ptr1[i] += type1(i);
   }
 };
-template < typename IndexType, typename type1 >
-struct callable12 {
+template <typename IndexType, typename type1>
+struct callable12
+{
   type1* working_ptr1;
   type1 const test_val1;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
     working_ptr1[i] += test_val1;
   }
 };
 
-template < typename IndexType, typename type2 >
-struct callable21 {
+template <typename IndexType, typename type2>
+struct callable21
+{
   type2* working_ptr2;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
     working_ptr2[i] += type2(i);
   }
 };
-template < typename IndexType, typename type2 >
-struct callable22 {
+template <typename IndexType, typename type2>
+struct callable22
+{
   type2* working_ptr2;
   type2 const test_val2;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
     working_ptr2[i] += test_val2;
   }
 };
 
-template < typename IndexType, typename type3 >
-struct callable31 {
+template <typename IndexType, typename type3>
+struct callable31
+{
   type3* working_ptr3;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
     working_ptr3[i] += type3(i);
   }
 };
-template < typename IndexType, typename type3 >
-struct callable32 {
+template <typename IndexType, typename type3>
+struct callable32
+{
   type3* working_ptr3;
   type3 const test_val3;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
     working_ptr3[i] += test_val3;
   }
 };
@@ -75,356 +87,382 @@ template <typename ExecPolicy,
           typename DispatchTyper,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupOrderedMultiple {
-void operator()(
-    std::mt19937& rng, IndexType max_begin, IndexType min_end,
-    IndexType num1, IndexType num2, IndexType num3,
-    IndexType pool_reuse, IndexType group_reuse) const
+          typename WORKING_RES>
+struct testWorkGroupOrderedMultiple
 {
-  ASSERT_GT(min_end, max_begin);
-  IndexType N = min_end + max_begin;
+  void operator()(std::mt19937& rng,
+                  IndexType max_begin,
+                  IndexType min_end,
+                  IndexType num1,
+                  IndexType num2,
+                  IndexType num3,
+                  IndexType pool_reuse,
+                  IndexType group_reuse) const
+  {
+    ASSERT_GT(min_end, max_begin);
+    IndexType N = min_end + max_begin;
 
-  std::vector<IndexType> begin1, end1;
-  std::vector<IndexType> begin2, end2;
-  std::vector<IndexType> begin3, end3;
+    std::vector<IndexType> begin1, end1;
+    std::vector<IndexType> begin2, end2;
+    std::vector<IndexType> begin3, end3;
 
-  {
-    using dist_type = std::uniform_int_distribution<IndexType>;
+    {
+      using dist_type = std::uniform_int_distribution<IndexType>;
 
-    for (IndexType j = IndexType(0); j < num1; j++) {
-      begin1.push_back(dist_type(max_begin, min_end-1)(rng));
-      end1.push_back(dist_type(begin1.back(), min_end)(rng));
-    }
+      for (IndexType j = IndexType(0); j < num1; j++)
+      {
+        begin1.push_back(dist_type(max_begin, min_end - 1)(rng));
+        end1.push_back(dist_type(begin1.back(), min_end)(rng));
+      }
 
-    for (IndexType j = IndexType(0); j < num2; j++) {
-      begin2.push_back(dist_type(max_begin, min_end-1)(rng));
-      end2.push_back(dist_type(begin2.back(), min_end)(rng));
-    }
+      for (IndexType j = IndexType(0); j < num2; j++)
+      {
+        begin2.push_back(dist_type(max_begin, min_end - 1)(rng));
+        end2.push_back(dist_type(begin2.back(), min_end)(rng));
+      }
 
-    for (IndexType j = IndexType(0); j < num3; j++) {
-      begin3.push_back(dist_type(max_begin, min_end-1)(rng));
-      end3.push_back(dist_type(begin3.back(), min_end)(rng));
+      for (IndexType j = IndexType(0); j < num3; j++)
+      {
+        begin3.push_back(dist_type(max_begin, min_end - 1)(rng));
+        end3.push_back(dist_type(begin3.back(), min_end)(rng));
+      }
     }
-  }
 
-  WORKING_RES res = WORKING_RES::get_default();
-  camp::resources::Resource working_res{res};
+    WORKING_RES res = WORKING_RES::get_default();
+    camp::resources::Resource working_res {res};
 
-  using type1 = IndexType;
-  using type2 = size_t;
-  using type3 = double;
+    using type1 = IndexType;
+    using type2 = size_t;
+    using type3 = double;
 
-  type1* working_array1 = nullptr;
-  type1* check_array1 = nullptr;
-  type1* test_array1 = nullptr;
+    type1* working_array1 = nullptr;
+    type1* check_array1   = nullptr;
+    type1* test_array1    = nullptr;
 
-  type2* working_array2 = nullptr;
-  type2* check_array2 = nullptr;
-  type2* test_array2 = nullptr;
+    type2* working_array2 = nullptr;
+    type2* check_array2   = nullptr;
+    type2* test_array2    = nullptr;
 
-  type3* working_array3 = nullptr;
-  type3* check_array3 = nullptr;
-  type3* test_array3 = nullptr;
+    type3* working_array3 = nullptr;
+    type3* check_array3   = nullptr;
+    type3* test_array3    = nullptr;
 
-  allocateForallTestData<type1>(N * num1,
-                                working_res,
-                                &working_array1,
-                                &check_array1,
-                                &test_array1);
+    allocateForallTestData<type1>(N * num1, working_res, &working_array1,
+                                  &check_array1, &test_array1);
 
-  allocateForallTestData<type2>(N * num2,
-                                working_res,
-                                &working_array2,
-                                &check_array2,
-                                &test_array2);
+    allocateForallTestData<type2>(N * num2, working_res, &working_array2,
+                                  &check_array2, &test_array2);
 
-  allocateForallTestData<type3>(N * num3,
-                                working_res,
-                                &working_array3,
-                                &check_array3,
-                                &test_array3);
+    allocateForallTestData<type3>(N * num3, working_res, &working_array3,
+                                  &check_array3, &test_array3);
 
-  type1 const test_val1(5);
-  type2 const test_val2(7);
-  type3 const test_val3(11);
+    type1 const test_val1(5);
+    type2 const test_val2(7);
+    type3 const test_val3(11);
 
-  using range_segment = RAJA::TypedRangeSegment<IndexType>;
+    using range_segment = RAJA::TypedRangeSegment<IndexType>;
 
 
-  using DispatchPolicy = typename DispatchTyper::template type<
-      camp::list<range_segment, callable11<IndexType, type1>>,
-      camp::list<range_segment, callable12<IndexType, type1>>,
-      camp::list<range_segment, callable21<IndexType, type2>>,
-      camp::list<range_segment, callable22<IndexType, type2>>,
-      camp::list<range_segment, callable31<IndexType, type3>>,
-      camp::list<range_segment, callable32<IndexType, type3>> >;
+    using DispatchPolicy = typename DispatchTyper::template type<
+        camp::list<range_segment, callable11<IndexType, type1>>,
+        camp::list<range_segment, callable12<IndexType, type1>>,
+        camp::list<range_segment, callable21<IndexType, type2>>,
+        camp::list<range_segment, callable22<IndexType, type2>>,
+        camp::list<range_segment, callable31<IndexType, type3>>,
+        camp::list<range_segment, callable32<IndexType, type3>>>;
 
-  using WorkPool_type = RAJA::WorkPool<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
+    using WorkPool_type =
+        RAJA::WorkPool<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
 
-  using WorkGroup_type = RAJA::WorkGroup<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
+    using WorkGroup_type =
+        RAJA::WorkGroup<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                              StoragePolicy, DispatchPolicy>,
+                        IndexType, RAJA::xargs<>, Allocator>;
 
-  using WorkSite_type = RAJA::WorkSite<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
+    using WorkSite_type =
+        RAJA::WorkSite<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
 
-  using resource_type = typename WorkGroup_type::resource_type;
+    using resource_type = typename WorkGroup_type::resource_type;
 
-  WorkPool_type pool(Allocator{});
-  WorkGroup_type group = pool.instantiate();
-  WorkSite_type site = group.run();
+    WorkPool_type pool(Allocator {});
+    WorkGroup_type group = pool.instantiate();
+    WorkSite_type site   = group.run();
 
-  for (IndexType pr = 0; pr < pool_reuse; pr++) {
+    for (IndexType pr = 0; pr < pool_reuse; pr++)
+    {
 
 
-    // fill_pool(pool, type1(5), type2(7), type3(11));
-    {
-      for (IndexType j = IndexType(0); j < num1; j++) {
-        type1* working_ptr1 = working_array1 + N * j;
-        pool.enqueue(range_segment{ begin1[j], end1[j] },
-            callable11<IndexType, type1>{working_ptr1});
-        pool.enqueue(range_segment{ begin1[j], end1[j] },
-            callable12<IndexType, type1>{working_ptr1, test_val1});
-      }
+      // fill_pool(pool, type1(5), type2(7), type3(11));
+      {
+        for (IndexType j = IndexType(0); j < num1; j++)
+        {
+          type1* working_ptr1 = working_array1 + N * j;
+          pool.enqueue(range_segment {begin1[j], end1[j]},
+                       callable11<IndexType, type1> {working_ptr1});
+          pool.enqueue(range_segment {begin1[j], end1[j]},
+                       callable12<IndexType, type1> {working_ptr1, test_val1});
+        }
 
-      for (IndexType j = IndexType(0); j < num2; j++) {
-        type2* working_ptr2 = working_array2 + N * j;
-        pool.enqueue(range_segment{ begin2[j], end2[j] },
-            callable21<IndexType, type2>{working_ptr2});
-        pool.enqueue(range_segment{ begin2[j], end2[j] },
-            callable22<IndexType, type2>{working_ptr2, test_val2});
-      }
+        for (IndexType j = IndexType(0); j < num2; j++)
+        {
+          type2* working_ptr2 = working_array2 + N * j;
+          pool.enqueue(range_segment {begin2[j], end2[j]},
+                       callable21<IndexType, type2> {working_ptr2});
+          pool.enqueue(range_segment {begin2[j], end2[j]},
+                       callable22<IndexType, type2> {working_ptr2, test_val2});
+        }
 
-      for (IndexType j = IndexType(0); j < num3; j++) {
-        type3* working_ptr3 = working_array3 + N * j;
-        pool.enqueue(range_segment{ begin3[j], end3[j] },
-            callable31<IndexType, type3>{working_ptr3});
-        pool.enqueue(range_segment{ begin3[j], end3[j] },
-            callable32<IndexType, type3>{working_ptr3, test_val3});
+        for (IndexType j = IndexType(0); j < num3; j++)
+        {
+          type3* working_ptr3 = working_array3 + N * j;
+          pool.enqueue(range_segment {begin3[j], end3[j]},
+                       callable31<IndexType, type3> {working_ptr3});
+          pool.enqueue(range_segment {begin3[j], end3[j]},
+                       callable32<IndexType, type3> {working_ptr3, test_val3});
+        }
       }
-    }
 
-    group = pool.instantiate();
+      group = pool.instantiate();
 
-    for (IndexType gr = 0; gr < group_reuse; gr++) {
-
-      // set_test_data();
+      for (IndexType gr = 0; gr < group_reuse; gr++)
       {
-        for (IndexType j = IndexType(0); j < num1; j++) {
-          type1* test_ptr1 = test_array1 + N * j;
-          for (IndexType i = IndexType(0); i < N; i++) {
-            test_ptr1[i] = type1(0);
+
+        // set_test_data();
+        {
+          for (IndexType j = IndexType(0); j < num1; j++)
+          {
+            type1* test_ptr1 = test_array1 + N * j;
+            for (IndexType i = IndexType(0); i < N; i++)
+            {
+              test_ptr1[i] = type1(0);
+            }
           }
-        }
 
-        for (IndexType j = IndexType(0); j < num2; j++) {
-          type2* test_ptr2 = test_array2 + N * j;
-          for (IndexType i = IndexType(0); i < N; i++) {
-            test_ptr2[i] = type2(0);
+          for (IndexType j = IndexType(0); j < num2; j++)
+          {
+            type2* test_ptr2 = test_array2 + N * j;
+            for (IndexType i = IndexType(0); i < N; i++)
+            {
+              test_ptr2[i] = type2(0);
+            }
           }
-        }
 
-        for (IndexType j = IndexType(0); j < num3; j++) {
-          type3* test_ptr3 = test_array3 + N * j;
-          for (IndexType i = IndexType(0); i < N; i++) {
-            test_ptr3[i] = type3(0);
+          for (IndexType j = IndexType(0); j < num3; j++)
+          {
+            type3* test_ptr3 = test_array3 + N * j;
+            for (IndexType i = IndexType(0); i < N; i++)
+            {
+              test_ptr3[i] = type3(0);
+            }
           }
-        }
 
 
-        res.memcpy(working_array1, test_array1, sizeof(type1) * N * num1);
+          res.memcpy(working_array1, test_array1, sizeof(type1) * N * num1);
 
-        res.memcpy(working_array2, test_array2, sizeof(type2) * N * num2);
+          res.memcpy(working_array2, test_array2, sizeof(type2) * N * num2);
 
-        res.memcpy(working_array3, test_array3, sizeof(type3) * N * num3);
+          res.memcpy(working_array3, test_array3, sizeof(type3) * N * num3);
 
 
-        for (IndexType j = IndexType(0); j < num1; j++) {
-          type1* test_ptr1 = test_array1 + N * j;
-          for (IndexType i = begin1[j]; i < end1[j]; ++i) {
-            test_ptr1[ i ] = type1(i);
+          for (IndexType j = IndexType(0); j < num1; j++)
+          {
+            type1* test_ptr1 = test_array1 + N * j;
+            for (IndexType i = begin1[j]; i < end1[j]; ++i)
+            {
+              test_ptr1[i] = type1(i);
+            }
           }
-        }
 
-        for (IndexType j = IndexType(0); j < num2; j++) {
-          type2* test_ptr2 = test_array2 + N * j;
-          for (IndexType i = begin2[j]; i < end2[j]; ++i) {
-            test_ptr2[ i ] = type2(i);
+          for (IndexType j = IndexType(0); j < num2; j++)
+          {
+            type2* test_ptr2 = test_array2 + N * j;
+            for (IndexType i = begin2[j]; i < end2[j]; ++i)
+            {
+              test_ptr2[i] = type2(i);
+            }
           }
-        }
 
-        for (IndexType j = IndexType(0); j < num3; j++) {
-          type3* test_ptr3 = test_array3 + N * j;
-          for (IndexType i = begin3[j]; i < end3[j]; ++i) {
-            test_ptr3[ i ] = type3(i);
+          for (IndexType j = IndexType(0); j < num3; j++)
+          {
+            type3* test_ptr3 = test_array3 + N * j;
+            for (IndexType i = begin3[j]; i < end3[j]; ++i)
+            {
+              test_ptr3[i] = type3(i);
+            }
           }
         }
-      }
 
-      site = group.run();
+        site = group.run();
 
-      auto e = resource_type::get_default().get_event();
-      e.wait();
+        auto e = resource_type::get_default().get_event();
+        e.wait();
 
-      // check_test_data(type1(5), type2(7), type3(11));
-      {
-        res.memcpy(check_array1, working_array1, sizeof(type1) * N * num1);
+        // check_test_data(type1(5), type2(7), type3(11));
+        {
+          res.memcpy(check_array1, working_array1, sizeof(type1) * N * num1);
 
-        res.memcpy(check_array2, working_array2, sizeof(type2) * N * num2);
+          res.memcpy(check_array2, working_array2, sizeof(type2) * N * num2);
 
-        res.memcpy(check_array3, working_array3, sizeof(type3) * N * num3);
+          res.memcpy(check_array3, working_array3, sizeof(type3) * N * num3);
 
-        res.wait();
+          res.wait();
 
 
-        for (IndexType j = IndexType(0); j < num1; j++) {
-          type1* test_ptr1 = test_array1 + N * j;
-          type1* check_ptr1 = check_array1 + N * j;
-          for (IndexType i = IndexType(0); i < begin1[j]; i++) {
-            ASSERT_EQ(test_ptr1[i], check_ptr1[i]);
-          }
-          for (IndexType i = begin1[j];    i < end1[j];   i++) {
-            ASSERT_EQ(test_ptr1[i] + test_val1, check_ptr1[i]);
+          for (IndexType j = IndexType(0); j < num1; j++)
+          {
+            type1* test_ptr1  = test_array1 + N * j;
+            type1* check_ptr1 = check_array1 + N * j;
+            for (IndexType i = IndexType(0); i < begin1[j]; i++)
+            {
+              ASSERT_EQ(test_ptr1[i], check_ptr1[i]);
+            }
+            for (IndexType i = begin1[j]; i < end1[j]; i++)
+            {
+              ASSERT_EQ(test_ptr1[i] + test_val1, check_ptr1[i]);
+            }
+            for (IndexType i = end1[j]; i < N; i++)
+            {
+              ASSERT_EQ(test_ptr1[i], check_ptr1[i]);
+            }
           }
-          for (IndexType i = end1[j];      i < N;     i++) {
-            ASSERT_EQ(test_ptr1[i], check_ptr1[i]);
-          }
-        }
 
-        for (IndexType j = IndexType(0); j < num2; j++) {
-          type2* test_ptr2 = test_array2 + N * j;
-          type2* check_ptr2 = check_array2 + N * j;
-          for (IndexType i = IndexType(0); i < begin2[j]; i++) {
-            ASSERT_EQ(test_ptr2[i], check_ptr2[i]);
+          for (IndexType j = IndexType(0); j < num2; j++)
+          {
+            type2* test_ptr2  = test_array2 + N * j;
+            type2* check_ptr2 = check_array2 + N * j;
+            for (IndexType i = IndexType(0); i < begin2[j]; i++)
+            {
+              ASSERT_EQ(test_ptr2[i], check_ptr2[i]);
+            }
+            for (IndexType i = begin2[j]; i < end2[j]; i++)
+            {
+              ASSERT_EQ(test_ptr2[i] + test_val2, check_ptr2[i]);
+            }
+            for (IndexType i = end2[j]; i < N; i++)
+            {
+              ASSERT_EQ(test_ptr2[i], check_ptr2[i]);
+            }
           }
-          for (IndexType i = begin2[j];    i < end2[j];   i++) {
-            ASSERT_EQ(test_ptr2[i] + test_val2, check_ptr2[i]);
-          }
-          for (IndexType i = end2[j];      i < N;     i++) {
-            ASSERT_EQ(test_ptr2[i], check_ptr2[i]);
-          }
-        }
 
-        for (IndexType j = IndexType(0); j < num3; j++) {
-          type3* test_ptr3 = test_array3 + N * j;
-          type3* check_ptr3 = check_array3 + N * j;
-          for (IndexType i = IndexType(0); i < begin3[j]; i++) {
-            ASSERT_EQ(test_ptr3[i], check_ptr3[i]);
-          }
-          for (IndexType i = begin3[j];    i < end3[j];   i++) {
-            ASSERT_EQ(test_ptr3[i] + test_val3, check_ptr3[i]);
-          }
-          for (IndexType i = end3[j];      i < N;     i++) {
-            ASSERT_EQ(test_ptr3[i], check_ptr3[i]);
+          for (IndexType j = IndexType(0); j < num3; j++)
+          {
+            type3* test_ptr3  = test_array3 + N * j;
+            type3* check_ptr3 = check_array3 + N * j;
+            for (IndexType i = IndexType(0); i < begin3[j]; i++)
+            {
+              ASSERT_EQ(test_ptr3[i], check_ptr3[i]);
+            }
+            for (IndexType i = begin3[j]; i < end3[j]; i++)
+            {
+              ASSERT_EQ(test_ptr3[i] + test_val3, check_ptr3[i]);
+            }
+            for (IndexType i = end3[j]; i < N; i++)
+            {
+              ASSERT_EQ(test_ptr3[i], check_ptr3[i]);
+            }
           }
         }
       }
-    }
 
-    site.clear();
-    group.clear();
-    pool.clear();
-  }
+      site.clear();
+      group.clear();
+      pool.clear();
+    }
 
 
-  deallocateForallTestData<type1>(working_res,
-                                  working_array1,
-                                  check_array1,
-                                  test_array1);
+    deallocateForallTestData<type1>(working_res, working_array1, check_array1,
+                                    test_array1);
 
-  deallocateForallTestData<type2>(working_res,
-                                  working_array2,
-                                  check_array2,
-                                  test_array2);
+    deallocateForallTestData<type2>(working_res, working_array2, check_array2,
+                                    test_array2);
 
-  deallocateForallTestData<type3>(working_res,
-                                  working_array3,
-                                  check_array3,
-                                  test_array3);
-}
+    deallocateForallTestData<type3>(working_res, working_array3, check_array3,
+                                    test_array3);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupOrderedMultiple<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                    StoragePolicy,
-                                    detail::indirect_function_call_dispatch_typer,
-                                    IndexType,
-                                    Allocator,
-                                    WORKING_RES> {
-void operator()(
-    std::mt19937&, IndexType, IndexType,
-    IndexType, IndexType, IndexType,
-    IndexType, IndexType) const
-{ }
+          typename WORKING_RES>
+struct testWorkGroupOrderedMultiple<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKING_RES>
+{
+  void operator()(std::mt19937&,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType) const
+  {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupOrderedMultiple<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                    StoragePolicy,
-                                    detail::indirect_virtual_function_dispatch_typer,
-                                    IndexType,
-                                    Allocator,
-                                    WORKING_RES> {
-void operator()(
-    std::mt19937&, IndexType, IndexType,
-    IndexType, IndexType, IndexType,
-    IndexType, IndexType) const
-{ }
+          typename WORKING_RES>
+struct testWorkGroupOrderedMultiple<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKING_RES>
+{
+  void operator()(std::mt19937&,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType) const
+  {}
 };
 
 #endif
 
 
-
 template <typename T>
 class WorkGroupBasicOrderedMultipleReuseFunctionalTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicOrderedMultipleReuseFunctionalTest);
 
 
-TYPED_TEST_P(WorkGroupBasicOrderedMultipleReuseFunctionalTest, BasicWorkGroupOrderedMultipleReuse)
+TYPED_TEST_P(WorkGroupBasicOrderedMultipleReuseFunctionalTest,
+             BasicWorkGroupOrderedMultipleReuse)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using OrderPolicy = typename camp::at<TypeParam, camp::num<1>>::type;
-  using StoragePolicy = typename camp::at<TypeParam, camp::num<2>>::type;
-  using DispatchTyper = typename camp::at<TypeParam, camp::num<3>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<5>>::type;
+  using ExecPolicy       = typename camp::at<TypeParam, camp::num<0>>::type;
+  using OrderPolicy      = typename camp::at<TypeParam, camp::num<1>>::type;
+  using StoragePolicy    = typename camp::at<TypeParam, camp::num<2>>::type;
+  using DispatchTyper    = typename camp::at<TypeParam, camp::num<3>>::type;
+  using IndexType        = typename camp::at<TypeParam, camp::num<4>>::type;
+  using Allocator        = typename camp::at<TypeParam, camp::num<5>>::type;
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<6>>::type;
 
-  std::mt19937 rng(std::random_device{}());
+  std::mt19937 rng(std::random_device {}());
   using dist_type = std::uniform_int_distribution<IndexType>;
 
   IndexType num1 = dist_type(IndexType(0), IndexType(8))(rng);
@@ -434,9 +472,11 @@ TYPED_TEST_P(WorkGroupBasicOrderedMultipleReuseFunctionalTest, BasicWorkGroupOrd
   IndexType pool_reuse  = dist_type(IndexType(0), IndexType(8))(rng);
   IndexType group_reuse = dist_type(IndexType(0), IndexType(8))(rng);
 
-  testWorkGroupOrderedMultiple< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper,
-                                IndexType, Allocator, WORKING_RESOURCE >{}(
-      rng, IndexType(96), IndexType(4000), num1, num2, num3, pool_reuse, group_reuse);
+  testWorkGroupOrderedMultiple<ExecPolicy, OrderPolicy, StoragePolicy,
+                               DispatchTyper, IndexType, Allocator,
+                               WORKING_RESOURCE> {}(
+      rng, IndexType(96), IndexType(4000), num1, num2, num3, pool_reuse,
+      group_reuse);
 }
 
 #endif  //__TEST_WORKGROUP_ORDERED_MULTIPLEREUSE__
diff --git a/test/functional/workgroup/tests/test-workgroup-Ordered-Single.hpp b/test/functional/workgroup/tests/test-workgroup-Ordered-Single.hpp
index c249b7de65..b0a2ac3734 100644
--- a/test/functional/workgroup/tests/test-workgroup-Ordered-Single.hpp
+++ b/test/functional/workgroup/tests/test-workgroup-Ordered-Single.hpp
@@ -25,149 +25,141 @@ template <typename ExecPolicy,
           typename DispatchTyper,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupOrderedSingle {
-void operator()(IndexType begin, IndexType end) const
+          typename WORKING_RES>
+struct testWorkGroupOrderedSingle
 {
-  ASSERT_GE(begin, (IndexType)0);
-  ASSERT_GE(end, begin);
-  IndexType N = end + begin;
-
-  WORKING_RES res = WORKING_RES::get_default();
-  camp::resources::Resource working_res{res};
-
-  IndexType* working_array;
-  IndexType* check_array;
-  IndexType* test_array;
-
-  allocateForallTestData<IndexType>(N,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
-
-  IndexType const test_val(5);
-
-  using range_segment = RAJA::TypedRangeSegment<IndexType>;
-
-  auto callable1 = [=] RAJA_HOST_DEVICE (IndexType i) {
-        working_array[i] += i;
-      };
-
-  auto callable2 = [=] RAJA_HOST_DEVICE (IndexType i) {
-        working_array[i] += test_val;
-      };
-
-  using DispatchPolicy = typename DispatchTyper::template type<
-      camp::list<range_segment, decltype(callable1)>,
-      camp::list<range_segment, decltype(callable2)> >;
-
-  using WorkPool_type = RAJA::WorkPool<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
-
-  using WorkGroup_type = RAJA::WorkGroup<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
-
-  using WorkSite_type = RAJA::WorkSite<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
-
+  void operator()(IndexType begin, IndexType end) const
   {
-    for (IndexType i = IndexType(0); i < N; i++) {
-      test_array[i] = IndexType(0);
-    }
+    ASSERT_GE(begin, (IndexType)0);
+    ASSERT_GE(end, begin);
+    IndexType N = end + begin;
 
-    res.memcpy(working_array, test_array, sizeof(IndexType) * N);
+    WORKING_RES res = WORKING_RES::get_default();
+    camp::resources::Resource working_res {res};
 
-    for (IndexType i = begin; i < end; ++i) {
-      test_array[ i ] = IndexType(i);
-    }
-  }
+    IndexType* working_array;
+    IndexType* check_array;
+    IndexType* test_array;
 
-  WorkPool_type pool(Allocator{});
+    allocateForallTestData<IndexType>(N, working_res, &working_array,
+                                      &check_array, &test_array);
 
-  {
-    pool.enqueue(range_segment{ begin, end }, callable1);
-    pool.enqueue(range_segment{ begin, end }, callable2);
-  }
+    IndexType const test_val(5);
 
-  WorkGroup_type group = pool.instantiate();
+    using range_segment = RAJA::TypedRangeSegment<IndexType>;
 
-  WorkSite_type site = group.run(res);
+    auto callable1 = [=] RAJA_HOST_DEVICE(IndexType i)
+    { working_array[i] += i; };
 
-  {
-    res.memcpy(check_array, working_array, sizeof(IndexType) * N);
-    res.wait();
+    auto callable2 = [=] RAJA_HOST_DEVICE(IndexType i)
+    { working_array[i] += test_val; };
+
+    using DispatchPolicy = typename DispatchTyper::template type<
+        camp::list<range_segment, decltype(callable1)>,
+        camp::list<range_segment, decltype(callable2)>>;
+
+    using WorkPool_type =
+        RAJA::WorkPool<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
+
+    using WorkGroup_type =
+        RAJA::WorkGroup<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                              StoragePolicy, DispatchPolicy>,
+                        IndexType, RAJA::xargs<>, Allocator>;
+
+    using WorkSite_type =
+        RAJA::WorkSite<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
+
+    {
+      for (IndexType i = IndexType(0); i < N; i++)
+      {
+        test_array[i] = IndexType(0);
+      }
 
-    for (IndexType i = IndexType(0); i < begin; i++) {
-      ASSERT_EQ(test_array[i], check_array[i]);
+      res.memcpy(working_array, test_array, sizeof(IndexType) * N);
+
+      for (IndexType i = begin; i < end; ++i)
+      {
+        test_array[i] = IndexType(i);
+      }
     }
-    for (IndexType i = begin;        i < end;   i++) {
-      ASSERT_EQ(test_array[i] + test_val, check_array[i]);
+
+    WorkPool_type pool(Allocator {});
+
+    {
+      pool.enqueue(range_segment {begin, end}, callable1);
+      pool.enqueue(range_segment {begin, end}, callable2);
     }
-    for (IndexType i = end;          i < N;     i++) {
-      ASSERT_EQ(test_array[i], check_array[i]);
+
+    WorkGroup_type group = pool.instantiate();
+
+    WorkSite_type site = group.run(res);
+
+    {
+      res.memcpy(check_array, working_array, sizeof(IndexType) * N);
+      res.wait();
+
+      for (IndexType i = IndexType(0); i < begin; i++)
+      {
+        ASSERT_EQ(test_array[i], check_array[i]);
+      }
+      for (IndexType i = begin; i < end; i++)
+      {
+        ASSERT_EQ(test_array[i] + test_val, check_array[i]);
+      }
+      for (IndexType i = end; i < N; i++)
+      {
+        ASSERT_EQ(test_array[i], check_array[i]);
+      }
     }
-  }
 
 
-  deallocateForallTestData<IndexType>(working_res,
-                                      working_array,
-                                      check_array,
-                                      test_array);
-}
+    deallocateForallTestData<IndexType>(working_res, working_array, check_array,
+                                        test_array);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupOrderedSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                  RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                  StoragePolicy,
-                                  detail::indirect_function_call_dispatch_typer,
-                                  IndexType,
-                                  Allocator,
-                                  WORKING_RES> {
-void operator()(
-    IndexType, IndexType) const
-{ }
+          typename WORKING_RES>
+struct testWorkGroupOrderedSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKING_RES>
+{
+  void operator()(IndexType, IndexType) const {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupOrderedSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                  RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                  StoragePolicy,
-                                  detail::indirect_virtual_function_dispatch_typer,
-                                  IndexType,
-                                  Allocator,
-                                  WORKING_RES> {
-void operator()(
-    IndexType, IndexType) const
-{ }
+          typename WORKING_RES>
+struct testWorkGroupOrderedSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKING_RES>
+{
+  void operator()(IndexType, IndexType) const {}
 };
 
 #endif
@@ -175,23 +167,23 @@ void operator()(
 
 template <typename T>
 class WorkGroupBasicOrderedSingleFunctionalTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicOrderedSingleFunctionalTest);
 
 
-TYPED_TEST_P(WorkGroupBasicOrderedSingleFunctionalTest, BasicWorkGroupOrderedSingle)
+TYPED_TEST_P(WorkGroupBasicOrderedSingleFunctionalTest,
+             BasicWorkGroupOrderedSingle)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using OrderPolicy = typename camp::at<TypeParam, camp::num<1>>::type;
-  using StoragePolicy = typename camp::at<TypeParam, camp::num<2>>::type;
-  using DispatchTyper = typename camp::at<TypeParam, camp::num<3>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<5>>::type;
+  using ExecPolicy       = typename camp::at<TypeParam, camp::num<0>>::type;
+  using OrderPolicy      = typename camp::at<TypeParam, camp::num<1>>::type;
+  using StoragePolicy    = typename camp::at<TypeParam, camp::num<2>>::type;
+  using DispatchTyper    = typename camp::at<TypeParam, camp::num<3>>::type;
+  using IndexType        = typename camp::at<TypeParam, camp::num<4>>::type;
+  using Allocator        = typename camp::at<TypeParam, camp::num<5>>::type;
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<6>>::type;
 
-  std::mt19937 rng(std::random_device{}());
+  std::mt19937 rng(std::random_device {}());
   using dist_type = std::uniform_int_distribution<IndexType>;
 
   IndexType b1 = dist_type(IndexType(0), IndexType(15))(rng);
@@ -203,9 +195,15 @@ TYPED_TEST_P(WorkGroupBasicOrderedSingleFunctionalTest, BasicWorkGroupOrderedSin
   IndexType b3 = dist_type(e2, IndexType(1023))(rng);
   IndexType e3 = dist_type(b3, IndexType(1024))(rng);
 
-  testWorkGroupOrderedSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator, WORKING_RESOURCE >{}(b1, e1);
-  testWorkGroupOrderedSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator, WORKING_RESOURCE >{}(b2, e2);
-  testWorkGroupOrderedSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator, WORKING_RESOURCE >{}(b3, e3);
+  testWorkGroupOrderedSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                             DispatchTyper, IndexType, Allocator,
+                             WORKING_RESOURCE> {}(b1, e1);
+  testWorkGroupOrderedSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                             DispatchTyper, IndexType, Allocator,
+                             WORKING_RESOURCE> {}(b2, e2);
+  testWorkGroupOrderedSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                             DispatchTyper, IndexType, Allocator,
+                             WORKING_RESOURCE> {}(b3, e3);
 }
 
 #endif  //__TEST_WORKGROUP_ORDERED_SINGLE__
diff --git a/test/functional/workgroup/tests/test-workgroup-Unordered-MultipleReuse.hpp b/test/functional/workgroup/tests/test-workgroup-Unordered-MultipleReuse.hpp
index 4207294bcf..c2265c3a96 100644
--- a/test/functional/workgroup/tests/test-workgroup-Unordered-MultipleReuse.hpp
+++ b/test/functional/workgroup/tests/test-workgroup-Unordered-MultipleReuse.hpp
@@ -19,30 +19,36 @@
 
 
 // These are defined here due to cuda limitations
-template < typename IndexType, typename type1 >
-struct callable1 {
+template <typename IndexType, typename type1>
+struct callable1
+{
   type1* working_ptr1;
   type1 const test_val1;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
-        working_ptr1[i] += type1(i) + test_val1;
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
+    working_ptr1[i] += type1(i) + test_val1;
   }
 };
 
-template < typename IndexType, typename type2 >
-struct callable2 {
+template <typename IndexType, typename type2>
+struct callable2
+{
   type2* working_ptr2;
   type2 const test_val2;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
-        working_ptr2[i] += type2(i) + test_val2;
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
+    working_ptr2[i] += type2(i) + test_val2;
   }
 };
 
-template < typename IndexType, typename type3 >
-struct callable3 {
+template <typename IndexType, typename type3>
+struct callable3
+{
   type3* working_ptr3;
   type3 const test_val3;
-  RAJA_HOST_DEVICE void operator()(IndexType i) const {
-        working_ptr3[i] += type3(i) + test_val3;
+  RAJA_HOST_DEVICE void operator()(IndexType i) const
+  {
+    working_ptr3[i] += type3(i) + test_val3;
   }
 };
 
@@ -53,335 +59,363 @@ template <typename ExecPolicy,
           typename DispatchTyper,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupUnorderedMultiple {
-void operator()(
-    std::mt19937& rng, IndexType max_begin, IndexType min_end,
-    IndexType num1, IndexType num2, IndexType num3,
-    IndexType pool_reuse, IndexType group_reuse) const
+          typename WORKING_RES>
+struct testWorkGroupUnorderedMultiple
 {
-  ASSERT_GT(min_end, max_begin);
-  IndexType N = min_end + max_begin;
+  void operator()(std::mt19937& rng,
+                  IndexType max_begin,
+                  IndexType min_end,
+                  IndexType num1,
+                  IndexType num2,
+                  IndexType num3,
+                  IndexType pool_reuse,
+                  IndexType group_reuse) const
+  {
+    ASSERT_GT(min_end, max_begin);
+    IndexType N = min_end + max_begin;
 
-  std::vector<IndexType> begin1, end1;
-  std::vector<IndexType> begin2, end2;
-  std::vector<IndexType> begin3, end3;
+    std::vector<IndexType> begin1, end1;
+    std::vector<IndexType> begin2, end2;
+    std::vector<IndexType> begin3, end3;
 
-  {
-    using dist_type = std::uniform_int_distribution<IndexType>;
+    {
+      using dist_type = std::uniform_int_distribution<IndexType>;
 
-    for (IndexType j = IndexType(0); j < num1; j++) {
-      begin1.push_back(dist_type(max_begin, min_end-1)(rng));
-      end1.push_back(dist_type(begin1.back(), min_end)(rng));
-    }
+      for (IndexType j = IndexType(0); j < num1; j++)
+      {
+        begin1.push_back(dist_type(max_begin, min_end - 1)(rng));
+        end1.push_back(dist_type(begin1.back(), min_end)(rng));
+      }
 
-    for (IndexType j = IndexType(0); j < num2; j++) {
-      begin2.push_back(dist_type(max_begin, min_end-1)(rng));
-      end2.push_back(dist_type(begin2.back(), min_end)(rng));
-    }
+      for (IndexType j = IndexType(0); j < num2; j++)
+      {
+        begin2.push_back(dist_type(max_begin, min_end - 1)(rng));
+        end2.push_back(dist_type(begin2.back(), min_end)(rng));
+      }
 
-    for (IndexType j = IndexType(0); j < num3; j++) {
-      begin3.push_back(dist_type(max_begin, min_end-1)(rng));
-      end3.push_back(dist_type(begin3.back(), min_end)(rng));
+      for (IndexType j = IndexType(0); j < num3; j++)
+      {
+        begin3.push_back(dist_type(max_begin, min_end - 1)(rng));
+        end3.push_back(dist_type(begin3.back(), min_end)(rng));
+      }
     }
-  }
 
-  WORKING_RES res = WORKING_RES::get_default();
-  camp::resources::Resource working_res{res};
+    WORKING_RES res = WORKING_RES::get_default();
+    camp::resources::Resource working_res {res};
 
-  using type1 = IndexType;
-  using type2 = size_t;
-  using type3 = double;
+    using type1 = IndexType;
+    using type2 = size_t;
+    using type3 = double;
 
-  type1* working_array1 = nullptr;
-  type1* check_array1 = nullptr;
-  type1* test_array1 = nullptr;
+    type1* working_array1 = nullptr;
+    type1* check_array1   = nullptr;
+    type1* test_array1    = nullptr;
 
-  type2* working_array2 = nullptr;
-  type2* check_array2 = nullptr;
-  type2* test_array2 = nullptr;
+    type2* working_array2 = nullptr;
+    type2* check_array2   = nullptr;
+    type2* test_array2    = nullptr;
 
-  type3* working_array3 = nullptr;
-  type3* check_array3 = nullptr;
-  type3* test_array3 = nullptr;
+    type3* working_array3 = nullptr;
+    type3* check_array3   = nullptr;
+    type3* test_array3    = nullptr;
 
-  allocateForallTestData<type1>(N * num1,
-                                working_res,
-                                &working_array1,
-                                &check_array1,
-                                &test_array1);
+    allocateForallTestData<type1>(N * num1, working_res, &working_array1,
+                                  &check_array1, &test_array1);
 
-  allocateForallTestData<type2>(N * num2,
-                                working_res,
-                                &working_array2,
-                                &check_array2,
-                                &test_array2);
+    allocateForallTestData<type2>(N * num2, working_res, &working_array2,
+                                  &check_array2, &test_array2);
 
-  allocateForallTestData<type3>(N * num3,
-                                working_res,
-                                &working_array3,
-                                &check_array3,
-                                &test_array3);
+    allocateForallTestData<type3>(N * num3, working_res, &working_array3,
+                                  &check_array3, &test_array3);
 
-  type1 const test_val1(5);
-  type2 const test_val2(7);
-  type3 const test_val3(11);
+    type1 const test_val1(5);
+    type2 const test_val2(7);
+    type3 const test_val3(11);
 
-  using range_segment = RAJA::TypedRangeSegment<IndexType>;
+    using range_segment = RAJA::TypedRangeSegment<IndexType>;
 
-  using DispatchPolicy = typename DispatchTyper::template type<
-      camp::list<range_segment, callable1<IndexType, type1>>,
-      camp::list<range_segment, callable2<IndexType, type2>>,
-      camp::list<range_segment, callable3<IndexType, type3>> >;
+    using DispatchPolicy = typename DispatchTyper::template type<
+        camp::list<range_segment, callable1<IndexType, type1>>,
+        camp::list<range_segment, callable2<IndexType, type2>>,
+        camp::list<range_segment, callable3<IndexType, type3>>>;
 
-  using WorkPool_type = RAJA::WorkPool<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
+    using WorkPool_type =
+        RAJA::WorkPool<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
 
-  using WorkGroup_type = RAJA::WorkGroup<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
-
-  using WorkSite_type = RAJA::WorkSite<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
+    using WorkGroup_type =
+        RAJA::WorkGroup<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                              StoragePolicy, DispatchPolicy>,
+                        IndexType, RAJA::xargs<>, Allocator>;
 
-  WorkPool_type pool(Allocator{});
+    using WorkSite_type =
+        RAJA::WorkSite<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
 
-  for (IndexType pr = 0; pr < pool_reuse; pr++) {
+    WorkPool_type pool(Allocator {});
 
-    // fill_pool(pool, type1(5), type2(7), type3(11));
+    for (IndexType pr = 0; pr < pool_reuse; pr++)
     {
-      for (IndexType j = IndexType(0); j < num1; j++) {
-        type1* working_ptr1 = working_array1 + N * j;
-        pool.enqueue(range_segment{ begin1[j], end1[j] },
-            callable1<IndexType, type1>{working_ptr1, test_val1});
-      }
 
-      for (IndexType j = IndexType(0); j < num2; j++) {
-        type2* working_ptr2 = working_array2 + N * j;
-        pool.enqueue(range_segment{ begin2[j], end2[j] },
-            callable2<IndexType, type2>{working_ptr2, test_val2});
-      }
+      // fill_pool(pool, type1(5), type2(7), type3(11));
+      {
+        for (IndexType j = IndexType(0); j < num1; j++)
+        {
+          type1* working_ptr1 = working_array1 + N * j;
+          pool.enqueue(range_segment {begin1[j], end1[j]},
+                       callable1<IndexType, type1> {working_ptr1, test_val1});
+        }
 
-      for (IndexType j = IndexType(0); j < num3; j++) {
-        type3* working_ptr3 = working_array3 + N * j;
-        pool.enqueue(range_segment{ begin3[j], end3[j] },
-            callable3<IndexType, type3>{working_ptr3, test_val3});
-      }
-    }
+        for (IndexType j = IndexType(0); j < num2; j++)
+        {
+          type2* working_ptr2 = working_array2 + N * j;
+          pool.enqueue(range_segment {begin2[j], end2[j]},
+                       callable2<IndexType, type2> {working_ptr2, test_val2});
+        }
 
-    WorkGroup_type group = pool.instantiate();
+        for (IndexType j = IndexType(0); j < num3; j++)
+        {
+          type3* working_ptr3 = working_array3 + N * j;
+          pool.enqueue(range_segment {begin3[j], end3[j]},
+                       callable3<IndexType, type3> {working_ptr3, test_val3});
+        }
+      }
 
-    for (IndexType gr = 0; gr < group_reuse; gr++) {
+      WorkGroup_type group = pool.instantiate();
 
-      // set_test_data();
+      for (IndexType gr = 0; gr < group_reuse; gr++)
       {
-        for (IndexType j = IndexType(0); j < num1; j++) {
-          type1* test_ptr1 = test_array1 + N * j;
-          for (IndexType i = IndexType(0); i < N; i++) {
-            test_ptr1[i] = type1(0);
+
+        // set_test_data();
+        {
+          for (IndexType j = IndexType(0); j < num1; j++)
+          {
+            type1* test_ptr1 = test_array1 + N * j;
+            for (IndexType i = IndexType(0); i < N; i++)
+            {
+              test_ptr1[i] = type1(0);
+            }
           }
-        }
 
-        for (IndexType j = IndexType(0); j < num2; j++) {
-          type2* test_ptr2 = test_array2 + N * j;
-          for (IndexType i = IndexType(0); i < N; i++) {
-            test_ptr2[i] = type2(0);
+          for (IndexType j = IndexType(0); j < num2; j++)
+          {
+            type2* test_ptr2 = test_array2 + N * j;
+            for (IndexType i = IndexType(0); i < N; i++)
+            {
+              test_ptr2[i] = type2(0);
+            }
           }
-        }
 
-        for (IndexType j = IndexType(0); j < num3; j++) {
-          type3* test_ptr3 = test_array3 + N * j;
-          for (IndexType i = IndexType(0); i < N; i++) {
-            test_ptr3[i] = type3(0);
+          for (IndexType j = IndexType(0); j < num3; j++)
+          {
+            type3* test_ptr3 = test_array3 + N * j;
+            for (IndexType i = IndexType(0); i < N; i++)
+            {
+              test_ptr3[i] = type3(0);
+            }
           }
-        }
 
 
-        res.memcpy(working_array1, test_array1, sizeof(type1) * N * num1);
+          res.memcpy(working_array1, test_array1, sizeof(type1) * N * num1);
 
-        res.memcpy(working_array2, test_array2, sizeof(type2) * N * num2);
+          res.memcpy(working_array2, test_array2, sizeof(type2) * N * num2);
 
-        res.memcpy(working_array3, test_array3, sizeof(type3) * N * num3);
+          res.memcpy(working_array3, test_array3, sizeof(type3) * N * num3);
 
 
-        for (IndexType j = IndexType(0); j < num1; j++) {
-          type1* test_ptr1 = test_array1 + N * j;
-          for (IndexType i = begin1[j]; i < end1[j]; ++i) {
-            test_ptr1[ i ] = type1(i);
+          for (IndexType j = IndexType(0); j < num1; j++)
+          {
+            type1* test_ptr1 = test_array1 + N * j;
+            for (IndexType i = begin1[j]; i < end1[j]; ++i)
+            {
+              test_ptr1[i] = type1(i);
+            }
           }
-        }
 
-        for (IndexType j = IndexType(0); j < num2; j++) {
-          type2* test_ptr2 = test_array2 + N * j;
-          for (IndexType i = begin2[j]; i < end2[j]; ++i) {
-            test_ptr2[ i ] = type2(i);
+          for (IndexType j = IndexType(0); j < num2; j++)
+          {
+            type2* test_ptr2 = test_array2 + N * j;
+            for (IndexType i = begin2[j]; i < end2[j]; ++i)
+            {
+              test_ptr2[i] = type2(i);
+            }
           }
-        }
 
-        for (IndexType j = IndexType(0); j < num3; j++) {
-          type3* test_ptr3 = test_array3 + N * j;
-          for (IndexType i = begin3[j]; i < end3[j]; ++i) {
-            test_ptr3[ i ] = type3(i);
+          for (IndexType j = IndexType(0); j < num3; j++)
+          {
+            type3* test_ptr3 = test_array3 + N * j;
+            for (IndexType i = begin3[j]; i < end3[j]; ++i)
+            {
+              test_ptr3[i] = type3(i);
+            }
           }
         }
-      }
 
-      WorkSite_type site = group.run(res);
+        WorkSite_type site = group.run(res);
 
-      // check_test_data(type1(5), type2(7), type3(11));
-      {
-        res.memcpy(check_array1, working_array1, sizeof(type1) * N * num1);
+        // check_test_data(type1(5), type2(7), type3(11));
+        {
+          res.memcpy(check_array1, working_array1, sizeof(type1) * N * num1);
 
-        res.memcpy(check_array2, working_array2, sizeof(type2) * N * num2);
+          res.memcpy(check_array2, working_array2, sizeof(type2) * N * num2);
 
-        res.memcpy(check_array3, working_array3, sizeof(type3) * N * num3);
+          res.memcpy(check_array3, working_array3, sizeof(type3) * N * num3);
 
-        res.wait();
+          res.wait();
 
 
-        for (IndexType j = IndexType(0); j < num1; j++) {
-          type1* test_ptr1 = test_array1 + N * j;
-          type1* check_ptr1 = check_array1 + N * j;
-          for (IndexType i = IndexType(0); i < begin1[j]; i++) {
-            ASSERT_EQ(test_ptr1[i], check_ptr1[i]);
+          for (IndexType j = IndexType(0); j < num1; j++)
+          {
+            type1* test_ptr1  = test_array1 + N * j;
+            type1* check_ptr1 = check_array1 + N * j;
+            for (IndexType i = IndexType(0); i < begin1[j]; i++)
+            {
+              ASSERT_EQ(test_ptr1[i], check_ptr1[i]);
+            }
+            for (IndexType i = begin1[j]; i < end1[j]; i++)
+            {
+              ASSERT_EQ(test_ptr1[i] + test_val1, check_ptr1[i]);
+            }
+            for (IndexType i = end1[j]; i < N; i++)
+            {
+              ASSERT_EQ(test_ptr1[i], check_ptr1[i]);
+            }
           }
-          for (IndexType i = begin1[j];    i < end1[j];   i++) {
-            ASSERT_EQ(test_ptr1[i] + test_val1, check_ptr1[i]);
-          }
-          for (IndexType i = end1[j];      i < N;     i++) {
-            ASSERT_EQ(test_ptr1[i], check_ptr1[i]);
-          }
-        }
 
-        for (IndexType j = IndexType(0); j < num2; j++) {
-          type2* test_ptr2 = test_array2 + N * j;
-          type2* check_ptr2 = check_array2 + N * j;
-          for (IndexType i = IndexType(0); i < begin2[j]; i++) {
-            ASSERT_EQ(test_ptr2[i], check_ptr2[i]);
-          }
-          for (IndexType i = begin2[j];    i < end2[j];   i++) {
-            ASSERT_EQ(test_ptr2[i] + test_val2, check_ptr2[i]);
+          for (IndexType j = IndexType(0); j < num2; j++)
+          {
+            type2* test_ptr2  = test_array2 + N * j;
+            type2* check_ptr2 = check_array2 + N * j;
+            for (IndexType i = IndexType(0); i < begin2[j]; i++)
+            {
+              ASSERT_EQ(test_ptr2[i], check_ptr2[i]);
+            }
+            for (IndexType i = begin2[j]; i < end2[j]; i++)
+            {
+              ASSERT_EQ(test_ptr2[i] + test_val2, check_ptr2[i]);
+            }
+            for (IndexType i = end2[j]; i < N; i++)
+            {
+              ASSERT_EQ(test_ptr2[i], check_ptr2[i]);
+            }
           }
-          for (IndexType i = end2[j];      i < N;     i++) {
-            ASSERT_EQ(test_ptr2[i], check_ptr2[i]);
-          }
-        }
 
-        for (IndexType j = IndexType(0); j < num3; j++) {
-          type3* test_ptr3 = test_array3 + N * j;
-          type3* check_ptr3 = check_array3 + N * j;
-          for (IndexType i = IndexType(0); i < begin3[j]; i++) {
-            ASSERT_EQ(test_ptr3[i], check_ptr3[i]);
-          }
-          for (IndexType i = begin3[j];    i < end3[j];   i++) {
-            ASSERT_EQ(test_ptr3[i] + test_val3, check_ptr3[i]);
-          }
-          for (IndexType i = end3[j];      i < N;     i++) {
-            ASSERT_EQ(test_ptr3[i], check_ptr3[i]);
+          for (IndexType j = IndexType(0); j < num3; j++)
+          {
+            type3* test_ptr3  = test_array3 + N * j;
+            type3* check_ptr3 = check_array3 + N * j;
+            for (IndexType i = IndexType(0); i < begin3[j]; i++)
+            {
+              ASSERT_EQ(test_ptr3[i], check_ptr3[i]);
+            }
+            for (IndexType i = begin3[j]; i < end3[j]; i++)
+            {
+              ASSERT_EQ(test_ptr3[i] + test_val3, check_ptr3[i]);
+            }
+            for (IndexType i = end3[j]; i < N; i++)
+            {
+              ASSERT_EQ(test_ptr3[i], check_ptr3[i]);
+            }
           }
         }
       }
-    }
 
-    pool.clear();
-  }
+      pool.clear();
+    }
 
 
-  deallocateForallTestData<type1>(working_res,
-                                  working_array1,
-                                  check_array1,
-                                  test_array1);
+    deallocateForallTestData<type1>(working_res, working_array1, check_array1,
+                                    test_array1);
 
-  deallocateForallTestData<type2>(working_res,
-                                  working_array2,
-                                  check_array2,
-                                  test_array2);
+    deallocateForallTestData<type2>(working_res, working_array2, check_array2,
+                                    test_array2);
 
-  deallocateForallTestData<type3>(working_res,
-                                  working_array3,
-                                  check_array3,
-                                  test_array3);
-}
+    deallocateForallTestData<type3>(working_res, working_array3, check_array3,
+                                    test_array3);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupUnorderedMultiple<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                      RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                      StoragePolicy,
-                                      detail::indirect_function_call_dispatch_typer,
-                                      IndexType,
-                                      Allocator,
-                                      WORKING_RES> {
-void operator()(
-    std::mt19937&, IndexType, IndexType,
-    IndexType, IndexType, IndexType,
-    IndexType, IndexType) const
-{ }
+          typename WORKING_RES>
+struct testWorkGroupUnorderedMultiple<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKING_RES>
+{
+  void operator()(std::mt19937&,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType) const
+  {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupUnorderedMultiple<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                      RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                      StoragePolicy,
-                                      detail::indirect_virtual_function_dispatch_typer,
-                                      IndexType,
-                                      Allocator,
-                                      WORKING_RES> {
-void operator()(
-    std::mt19937&, IndexType, IndexType,
-    IndexType, IndexType, IndexType,
-    IndexType, IndexType) const
-{ }
+          typename WORKING_RES>
+struct testWorkGroupUnorderedMultiple<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKING_RES>
+{
+  void operator()(std::mt19937&,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType,
+                  IndexType) const
+  {}
 };
 
 #endif
 
 
 template <typename T>
-class WorkGroupBasicUnorderedMultipleReuseFunctionalTest : public ::testing::Test
-{
-};
+class WorkGroupBasicUnorderedMultipleReuseFunctionalTest
+    : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicUnorderedMultipleReuseFunctionalTest);
 
 
-TYPED_TEST_P(WorkGroupBasicUnorderedMultipleReuseFunctionalTest, BasicWorkGroupUnorderedMultipleReuse)
+TYPED_TEST_P(WorkGroupBasicUnorderedMultipleReuseFunctionalTest,
+             BasicWorkGroupUnorderedMultipleReuse)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using OrderPolicy = typename camp::at<TypeParam, camp::num<1>>::type;
-  using StoragePolicy = typename camp::at<TypeParam, camp::num<2>>::type;
-  using DispatchTyper = typename camp::at<TypeParam, camp::num<3>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<5>>::type;
+  using ExecPolicy       = typename camp::at<TypeParam, camp::num<0>>::type;
+  using OrderPolicy      = typename camp::at<TypeParam, camp::num<1>>::type;
+  using StoragePolicy    = typename camp::at<TypeParam, camp::num<2>>::type;
+  using DispatchTyper    = typename camp::at<TypeParam, camp::num<3>>::type;
+  using IndexType        = typename camp::at<TypeParam, camp::num<4>>::type;
+  using Allocator        = typename camp::at<TypeParam, camp::num<5>>::type;
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<6>>::type;
 
-  std::mt19937 rng(std::random_device{}());
+  std::mt19937 rng(std::random_device {}());
   using dist_type = std::uniform_int_distribution<IndexType>;
 
   IndexType num1 = dist_type(IndexType(0), IndexType(8))(rng);
@@ -391,9 +425,11 @@ TYPED_TEST_P(WorkGroupBasicUnorderedMultipleReuseFunctionalTest, BasicWorkGroupU
   IndexType pool_reuse  = dist_type(IndexType(0), IndexType(8))(rng);
   IndexType group_reuse = dist_type(IndexType(0), IndexType(8))(rng);
 
-  testWorkGroupUnorderedMultiple< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper,
-                                  IndexType, Allocator, WORKING_RESOURCE >{}(
-      rng, IndexType(96), IndexType(4000), num1, num2, num3, pool_reuse, group_reuse);
+  testWorkGroupUnorderedMultiple<ExecPolicy, OrderPolicy, StoragePolicy,
+                                 DispatchTyper, IndexType, Allocator,
+                                 WORKING_RESOURCE> {}(
+      rng, IndexType(96), IndexType(4000), num1, num2, num3, pool_reuse,
+      group_reuse);
 }
 
 #endif  //__TEST_WORKGROUP_UNORDERED_MULTIPLEREUSE__
diff --git a/test/functional/workgroup/tests/test-workgroup-Unordered-Single.hpp b/test/functional/workgroup/tests/test-workgroup-Unordered-Single.hpp
index 84d44dd496..629bccdb0d 100644
--- a/test/functional/workgroup/tests/test-workgroup-Unordered-Single.hpp
+++ b/test/functional/workgroup/tests/test-workgroup-Unordered-Single.hpp
@@ -24,150 +24,143 @@ template <typename ExecPolicy,
           typename DispatchTyper,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupUnorderedSingle {
-void operator()(IndexType begin, IndexType end) const
+          typename WORKING_RES>
+struct testWorkGroupUnorderedSingle
 {
+  void operator()(IndexType begin, IndexType end) const
+  {
 
-  ASSERT_GE(begin, (IndexType)0);
-  ASSERT_GE(end, begin);
-  IndexType N = end + begin;
+    ASSERT_GE(begin, (IndexType)0);
+    ASSERT_GE(end, begin);
+    IndexType N = end + begin;
 
-  WORKING_RES res = WORKING_RES::get_default();
-  camp::resources::Resource working_res{res};
+    WORKING_RES res = WORKING_RES::get_default();
+    camp::resources::Resource working_res {res};
 
-  IndexType* working_array;
-  IndexType* check_array;
-  IndexType* test_array;
+    IndexType* working_array;
+    IndexType* check_array;
+    IndexType* test_array;
 
-  allocateForallTestData<IndexType>(N,
-                                    working_res,
-                                    &working_array,
-                                    &check_array,
-                                    &test_array);
+    allocateForallTestData<IndexType>(N, working_res, &working_array,
+                                      &check_array, &test_array);
 
-  IndexType const test_val(5);
+    IndexType const test_val(5);
 
-  using range_segment = RAJA::TypedRangeSegment<IndexType>;
+    using range_segment = RAJA::TypedRangeSegment<IndexType>;
 
-  auto callable = [=] RAJA_HOST_DEVICE (IndexType i) {
-        working_array[i] += i + test_val;
-      };
+    auto callable = [=] RAJA_HOST_DEVICE(IndexType i)
+    { working_array[i] += i + test_val; };
 
-  using DispatchPolicy = typename DispatchTyper::template type<
-      camp::list<range_segment, decltype(callable)> >;
+    using DispatchPolicy = typename DispatchTyper::template type<
+        camp::list<range_segment, decltype(callable)>>;
 
-  using WorkPool_type = RAJA::WorkPool<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
+    using WorkPool_type =
+        RAJA::WorkPool<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
 
-  using WorkGroup_type = RAJA::WorkGroup<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
+    using WorkGroup_type =
+        RAJA::WorkGroup<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                              StoragePolicy, DispatchPolicy>,
+                        IndexType, RAJA::xargs<>, Allocator>;
 
-  using WorkSite_type = RAJA::WorkSite<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
+    using WorkSite_type =
+        RAJA::WorkSite<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
 
-  using resource_type = typename WorkSite_type::resource_type;
-  static_assert(std::is_same<WORKING_RES, resource_type>::value,
-                "Expected same resource types");
+    using resource_type = typename WorkSite_type::resource_type;
+    static_assert(std::is_same<WORKING_RES, resource_type>::value,
+                  "Expected same resource types");
 
-  {
-    for (IndexType i = IndexType(0); i < N; i++) {
-      test_array[i] = IndexType(0);
-    }
+    {
+      for (IndexType i = IndexType(0); i < N; i++)
+      {
+        test_array[i] = IndexType(0);
+      }
 
-    res.memcpy(working_array, test_array, sizeof(IndexType) * N);
+      res.memcpy(working_array, test_array, sizeof(IndexType) * N);
 
-    for (IndexType i = begin; i < end; ++i) {
-      test_array[ i ] = IndexType(i);
+      for (IndexType i = begin; i < end; ++i)
+      {
+        test_array[i] = IndexType(i);
+      }
     }
-  }
 
-  WorkPool_type pool(Allocator{});
+    WorkPool_type pool(Allocator {});
 
-  {
-    pool.enqueue(range_segment{ begin, end }, callable);
-  }
+    {
+      pool.enqueue(range_segment {begin, end}, callable);
+    }
 
-  WorkGroup_type group = pool.instantiate();
+    WorkGroup_type group = pool.instantiate();
 
-  WorkSite_type site = group.run();
+    WorkSite_type site = group.run();
 
-  auto e = site.get_resource().get_event();
-  e.wait();
+    auto e = site.get_resource().get_event();
+    e.wait();
 
-  {
-    res.memcpy(check_array, working_array, sizeof(IndexType) * N);
+    {
+      res.memcpy(check_array, working_array, sizeof(IndexType) * N);
 
-    for (IndexType i = IndexType(0); i < begin; i++) {
-      ASSERT_EQ(test_array[i], check_array[i]);
-    }
-    for (IndexType i = begin;        i < end;   i++) {
-      ASSERT_EQ(test_array[i] + test_val, check_array[i]);
+      for (IndexType i = IndexType(0); i < begin; i++)
+      {
+        ASSERT_EQ(test_array[i], check_array[i]);
+      }
+      for (IndexType i = begin; i < end; i++)
+      {
+        ASSERT_EQ(test_array[i] + test_val, check_array[i]);
+      }
+      for (IndexType i = end; i < N; i++)
+      {
+        ASSERT_EQ(test_array[i], check_array[i]);
+      }
     }
-    for (IndexType i = end;          i < N;     i++) {
-      ASSERT_EQ(test_array[i], check_array[i]);
-    }
-  }
 
 
-  deallocateForallTestData<IndexType>(working_res,
-                                      working_array,
-                                      check_array,
-                                      test_array);
-}
+    deallocateForallTestData<IndexType>(working_res, working_array, check_array,
+                                        test_array);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupUnorderedSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                    StoragePolicy,
-                                    detail::indirect_function_call_dispatch_typer,
-                                    IndexType,
-                                    Allocator,
-                                    WORKING_RES> {
-void operator()(
-    IndexType, IndexType) const
-{ }
+          typename WORKING_RES>
+struct testWorkGroupUnorderedSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKING_RES>
+{
+  void operator()(IndexType, IndexType) const {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
-          typename WORKING_RES
-          >
-struct testWorkGroupUnorderedSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                    StoragePolicy,
-                                    detail::indirect_virtual_function_dispatch_typer,
-                                    IndexType,
-                                    Allocator,
-                                    WORKING_RES> {
-void operator()(
-    IndexType, IndexType) const
-{ }
+          typename WORKING_RES>
+struct testWorkGroupUnorderedSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKING_RES>
+{
+  void operator()(IndexType, IndexType) const {}
 };
 
 #endif
@@ -175,23 +168,23 @@ void operator()(
 
 template <typename T>
 class WorkGroupBasicUnorderedSingleFunctionalTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicUnorderedSingleFunctionalTest);
 
 
-TYPED_TEST_P(WorkGroupBasicUnorderedSingleFunctionalTest, BasicWorkGroupUnorderedSingle)
+TYPED_TEST_P(WorkGroupBasicUnorderedSingleFunctionalTest,
+             BasicWorkGroupUnorderedSingle)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using OrderPolicy = typename camp::at<TypeParam, camp::num<1>>::type;
-  using StoragePolicy = typename camp::at<TypeParam, camp::num<2>>::type;
-  using DispatchTyper = typename camp::at<TypeParam, camp::num<3>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<5>>::type;
+  using ExecPolicy       = typename camp::at<TypeParam, camp::num<0>>::type;
+  using OrderPolicy      = typename camp::at<TypeParam, camp::num<1>>::type;
+  using StoragePolicy    = typename camp::at<TypeParam, camp::num<2>>::type;
+  using DispatchTyper    = typename camp::at<TypeParam, camp::num<3>>::type;
+  using IndexType        = typename camp::at<TypeParam, camp::num<4>>::type;
+  using Allocator        = typename camp::at<TypeParam, camp::num<5>>::type;
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<6>>::type;
 
-  std::mt19937 rng(std::random_device{}());
+  std::mt19937 rng(std::random_device {}());
   using dist_type = std::uniform_int_distribution<IndexType>;
 
   IndexType b1 = dist_type(IndexType(0), IndexType(15))(rng);
@@ -203,9 +196,15 @@ TYPED_TEST_P(WorkGroupBasicUnorderedSingleFunctionalTest, BasicWorkGroupUnordere
   IndexType b3 = dist_type(e2, IndexType(1023))(rng);
   IndexType e3 = dist_type(b3, IndexType(1024))(rng);
 
-  testWorkGroupUnorderedSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator, WORKING_RESOURCE >{}(b1, e1);
-  testWorkGroupUnorderedSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator, WORKING_RESOURCE >{}(b2, e2);
-  testWorkGroupUnorderedSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator, WORKING_RESOURCE >{}(b3, e3);
+  testWorkGroupUnorderedSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                               DispatchTyper, IndexType, Allocator,
+                               WORKING_RESOURCE> {}(b1, e1);
+  testWorkGroupUnorderedSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                               DispatchTyper, IndexType, Allocator,
+                               WORKING_RESOURCE> {}(b2, e2);
+  testWorkGroupUnorderedSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                               DispatchTyper, IndexType, Allocator,
+                               WORKING_RESOURCE> {}(b3, e3);
 }
 
 #endif  //__TEST_WORKGROUP_UNORDERED_SINGLE__
diff --git a/test/include/RAJA_gtest.hpp b/test/include/RAJA_gtest.hpp
index a699171a94..7a96d914ae 100644
--- a/test/include/RAJA_gtest.hpp
+++ b/test/include/RAJA_gtest.hpp
@@ -21,110 +21,128 @@
 #ifdef RAJA_COMPILER_MSVC
 // disable some warnings for MSVC that we can't control, because they're emitted
 // by googletest headers
-#pragma warning( disable : 4244 )  // Force msvc to not emit conversion warning
-#pragma warning( disable : 4389 )  // Force msvc to not emit conversion warning
+#pragma warning(disable : 4244)  // Force msvc to not emit conversion warning
+#pragma warning(disable : 4389)  // Force msvc to not emit conversion warning
 #endif
 
 #include "gtest/gtest.h"
 
-#define GPU_TEST(X, Y)                 \
-  static void gpu_test_##X##_##Y();    \
-  TEST(X, Y) { gpu_test_##X##_##Y(); } \
+#define GPU_TEST(X, Y)                                                         \
+  static void gpu_test_##X##_##Y();                                            \
+  TEST(X, Y) { gpu_test_##X##_##Y(); }                                         \
   static void gpu_test_##X##_##Y()
 
-#define GPU_TEST_F(test_fixture, test_name)                  \
-  static void gpu_test_f_##test_fixture##_##test_name();     \
-  GTEST_TEST_(test_fixture,                                   \
-              test_name,                                      \
-              test_fixture,                                   \
-              ::testing::internal::GetTypeId<test_fixture>()) \
-  {                                                           \
-    gpu_test_f_##test_fixture##_##test_name();               \
-  }                                                           \
+#define GPU_TEST_F(test_fixture, test_name)                                    \
+  static void gpu_test_f_##test_fixture##_##test_name();                       \
+  GTEST_TEST_(test_fixture, test_name, test_fixture,                           \
+              ::testing::internal::GetTypeId<test_fixture>())                  \
+  {                                                                            \
+    gpu_test_f_##test_fixture##_##test_name();                                 \
+  }                                                                            \
   static void gpu_test_f_##test_fixture##_##test_name()
 
-#define GPU_TEST_P(test_case_name, test_name)                               \
-  template <typename Invocable>                                              \
-  static void gtest_gpu_##test_case_name##_##test_name(Invocable &&);       \
-  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                    \
-      : public test_case_name                                                \
-  {                                                                          \
-  public:                                                                    \
-    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                   \
-    virtual void TestBody()                                                  \
-    {                                                                        \
-      gtest_gpu_##test_case_name##_##test_name([&] { return GetParam(); }); \
-    }                                                                        \
-                                                                             \
-  private:                                                                   \
-    static int AddToRegistry()                                               \
-    {                                                                        \
-      ::testing::UnitTest::GetInstance()                                     \
-          ->parameterized_test_registry()                                    \
-          .GetTestCasePatternHolder<test_case_name>(                         \
-              #test_case_name,                                               \
-              ::testing::internal::CodeLocation(__FILE__, __LINE__))         \
-          ->AddTestPattern(                                                  \
-              #test_case_name,                                               \
-              #test_name,                                                    \
-              new ::testing::internal::TestMetaFactory<                      \
-                  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>());     \
-      return 0;                                                              \
-    }                                                                        \
-    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;             \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,   \
-                                                           test_name));      \
-  };                                                                         \
-  int GTEST_TEST_CLASS_NAME_(test_case_name,                                 \
-                             test_name)::gtest_registering_dummy_ =          \
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry();    \
-  template <typename Invocable>                                              \
-  static void gtest_gpu_##test_case_name##_##test_name(Invocable &&GetParam)
-
-#define GPU_TYPED_TEST_P(SuiteName, TestName)                           \
-    namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                       \
-      template <typename gtest_TypeParam_>                              \
-      class TestName : public SuiteName<gtest_TypeParam_> {             \
-       private:                                                         \
-        typedef SuiteName<gtest_TypeParam_> TestFixture;                \
-        typedef gtest_TypeParam_ TypeParam;                             \
-       public:                                                          \
-        void TestBody() override;                                       \
-      };                                                                \
-      static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
-          GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName(       \
-              __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName),          \
-              GTEST_STRINGIFY_(TestName));                              \
-    }                                                                   \
-    template <typename gtest_TypeParam_>                                \
-    void GTEST_SUITE_NAMESPACE_(                                        \
-        SuiteName)::TestName<gtest_TypeParam_>::TestBody()
+#define GPU_TEST_P(test_case_name, test_name)                                  \
+  template <typename Invocable>                                                \
+  static void gtest_gpu_##test_case_name##_##test_name(Invocable&&);           \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                      \
+      : public test_case_name                                                  \
+  {                                                                            \
+  public:                                                                      \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                     \
+    virtual void TestBody()                                                    \
+    {                                                                          \
+      gtest_gpu_##test_case_name##_##test_name([&] { return GetParam(); });    \
+    }                                                                          \
+                                                                               \
+  private:                                                                     \
+    static int AddToRegistry()                                                 \
+    {                                                                          \
+      ::testing::UnitTest::GetInstance()                                       \
+          ->parameterized_test_registry()                                      \
+          .GetTestCasePatternHolder<test_case_name>(                           \
+              #test_case_name,                                                 \
+              ::testing::internal::CodeLocation(__FILE__, __LINE__))           \
+          ->AddTestPattern(                                                    \
+              #test_case_name, #test_name,                                     \
+              new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
+                  test_case_name, test_name)>());                              \
+      return 0;                                                                \
+    }                                                                          \
+    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;               \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,     \
+                                                           test_name));        \
+  };                                                                           \
+  int GTEST_TEST_CLASS_NAME_(test_case_name,                                   \
+                             test_name)::gtest_registering_dummy_ =            \
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry();      \
+  template <typename Invocable>                                                \
+  static void gtest_gpu_##test_case_name##_##test_name(Invocable&& GetParam)
+
+#define GPU_TYPED_TEST_P(SuiteName, TestName)                                  \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName)                                  \
+  {                                                                            \
+  template <typename gtest_TypeParam_>                                         \
+  class TestName : public SuiteName<gtest_TypeParam_>                          \
+  {                                                                            \
+  private:                                                                     \
+    typedef SuiteName<gtest_TypeParam_> TestFixture;                           \
+    typedef gtest_TypeParam_ TypeParam;                                        \
+                                                                               \
+  public:                                                                      \
+    void TestBody() override;                                                  \
+  };                                                                           \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ =            \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName(                  \
+          __FILE__,                                                            \
+          __LINE__,                                                            \
+          GTEST_STRINGIFY_(SuiteName),                                         \
+          GTEST_STRINGIFY_(TestName));                                         \
+  }                                                                            \
+  template <typename gtest_TypeParam_>                                         \
+  void GTEST_SUITE_NAMESPACE_(                                                 \
+      SuiteName)::TestName<gtest_TypeParam_>::TestBody()
 
 
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4244 )  // reenable warning
-#pragma warning( default : 4389 )  // reenable warning
+#pragma warning(default : 4244)  // reenable warning
+#pragma warning(default : 4389)  // reenable warning
 #endif
 
 
 #if defined(__CUDA_ARCH__)
 
-#define RAJA_ASSERT_EQ(X,Y) \
-{\
-  auto x = (X); \
-  auto y = (Y); \
-  if(x != y){ \
-      asm("trap;"); \
-  } \
-}
+#define RAJA_ASSERT_EQ(X, Y)                                                   \
+  {                                                                            \
+    auto x = (X);                                                              \
+    auto y = (Y);                                                              \
+    if (x != y)                                                                \
+    {                                                                          \
+      asm("trap;");                                                            \
+    }                                                                          \
+  }
 
-#define RAJA_ASSERT_FLOAT_EQ(X,Y) {RAJA_ASSERT_EQ(X,Y);}
-#define RAJA_ASSERT_DOUBLE_EQ(X,Y) {RAJA_ASSERT_EQ(X,Y);}
+#define RAJA_ASSERT_FLOAT_EQ(X, Y)                                             \
+  {                                                                            \
+    RAJA_ASSERT_EQ(X, Y);                                                      \
+  }
+#define RAJA_ASSERT_DOUBLE_EQ(X, Y)                                            \
+  {                                                                            \
+    RAJA_ASSERT_EQ(X, Y);                                                      \
+  }
 #else
 
-#define RAJA_ASSERT_EQ(X,Y) {ASSERT_EQ(X,Y);}
-#define RAJA_ASSERT_FLOAT_EQ(X,Y) {ASSERT_FLOAT_EQ(X,Y);}
-#define RAJA_ASSERT_DOUBLE_EQ(X,Y) {ASSERT_DOUBLE_EQ(X,Y);}
+#define RAJA_ASSERT_EQ(X, Y)                                                   \
+  {                                                                            \
+    ASSERT_EQ(X, Y);                                                           \
+  }
+#define RAJA_ASSERT_FLOAT_EQ(X, Y)                                             \
+  {                                                                            \
+    ASSERT_FLOAT_EQ(X, Y);                                                     \
+  }
+#define RAJA_ASSERT_DOUBLE_EQ(X, Y)                                            \
+  {                                                                            \
+    ASSERT_DOUBLE_EQ(X, Y);                                                    \
+  }
 
 #endif
 /*
@@ -139,54 +157,68 @@
  *  Now you can just say ASSERT_SCALAR_EQ(X, Y) and things should just work
  *
  */
-#define ASSERT_SCALAR_EQ(X,Y) { \
-  int value_type = RAJA::gtest::getScalarType(X); \
-  switch(value_type){ \
-    case 1: {RAJA_ASSERT_FLOAT_EQ(X,Y);} break; \
-    case 2: {RAJA_ASSERT_DOUBLE_EQ(X,Y);} break; \
-    default: {RAJA_ASSERT_EQ(X,Y);} \
-  }; }
+#define ASSERT_SCALAR_EQ(X, Y)                                                 \
+  {                                                                            \
+    int value_type = RAJA::gtest::getScalarType(X);                            \
+    switch (value_type)                                                        \
+    {                                                                          \
+    case 1:                                                                    \
+    {                                                                          \
+      RAJA_ASSERT_FLOAT_EQ(X, Y);                                              \
+    }                                                                          \
+    break;                                                                     \
+    case 2:                                                                    \
+    {                                                                          \
+      RAJA_ASSERT_DOUBLE_EQ(X, Y);                                             \
+    }                                                                          \
+    break;                                                                     \
+    default:                                                                   \
+    {                                                                          \
+      RAJA_ASSERT_EQ(X, Y);                                                    \
+    }                                                                          \
+    };                                                                         \
+  }
 
 // Traits use by the above maco
 namespace RAJA
 {
-  namespace gtest
-  {
-    template<typename T>
-    struct AssertScalarTraits{
-        static constexpr int value = 0;
-    };
-
-    template<>
-    struct AssertScalarTraits<float>{
-        static constexpr int value = 1;
-    };
-
-    template<>
-    struct AssertScalarTraits<double>{
-        static constexpr int value = 2;
-    };
-
-    template<typename T>
-    inline
-    constexpr
-    int getScalarType(T const &){
-      return AssertScalarTraits<T>::value;
-    }
+namespace gtest
+{
+template <typename T>
+struct AssertScalarTraits
+{
+  static constexpr int value = 0;
+};
 
+template <>
+struct AssertScalarTraits<float>
+{
+  static constexpr int value = 1;
+};
 
-  }
+template <>
+struct AssertScalarTraits<double>
+{
+  static constexpr int value = 2;
+};
+
+template <typename T>
+inline constexpr int getScalarType(T const&)
+{
+  return AssertScalarTraits<T>::value;
 }
 
+
+}  // namespace gtest
+}  // namespace RAJA
+
 // This always returns a 0, but forces compiler not to compile-out
 // constant values
-#define NO_OPT_ZERO (rand()/RAND_MAX)
+#define NO_OPT_ZERO (rand() / RAND_MAX)
 
 // Returns a random value between 1.0 and 2.0, and helps force the compiler
 // to not compile-out constant values
-#define NO_OPT_RAND (1.0+(double)rand()/RAND_MAX)
-
-
+#define NO_OPT_RAND (1.0 + (double)rand() / RAND_MAX)
 
 
 #endif  // closing endif for header file include guard
diff --git a/test/include/RAJA_test-abs.hpp b/test/include/RAJA_test-abs.hpp
index 85b5002d92..57bfadf0c0 100644
--- a/test/include/RAJA_test-abs.hpp
+++ b/test/include/RAJA_test-abs.hpp
@@ -13,20 +13,21 @@
 
 #include <cmath>
 
-namespace RAJA {
+namespace RAJA
+{
 
-  template<typename T>
-  camp::concepts::enable_if_t<T, std::is_floating_point<T> >
-  test_abs(T&& val) {
-    return std::fabs(val);
-  } 
+template <typename T>
+camp::concepts::enable_if_t<T, std::is_floating_point<T>> test_abs(T&& val)
+{
+  return std::fabs(val);
+}
 
-  template<typename T>
-  camp::concepts::enable_if_t<T, std::is_integral<T> >
-  test_abs(T&& val) {
-    return std::abs(val);
-  }
+template <typename T>
+camp::concepts::enable_if_t<T, std::is_integral<T>> test_abs(T&& val)
+{
+  return std::abs(val);
+}
 
-} // namespace RAJA
+}  // namespace RAJA
 
-#endif // __RAJA_test_abs_HPP__
+#endif  // __RAJA_test_abs_HPP__
diff --git a/test/include/RAJA_test-atomic-ref-types.hpp b/test/include/RAJA_test-atomic-ref-types.hpp
index f854932ab8..2f6280ed0e 100644
--- a/test/include/RAJA_test-atomic-ref-types.hpp
+++ b/test/include/RAJA_test-atomic-ref-types.hpp
@@ -18,74 +18,71 @@
 
 #include <type_traits>
 
-template < typename T >
-RAJA_INLINE
-RAJA_HOST_DEVICE
-typename std::enable_if<sizeof(T) == 1, T>::type np2m1(T val)
+template <typename T>
+RAJA_INLINE RAJA_HOST_DEVICE typename std::enable_if<sizeof(T) == 1, T>::type
+np2m1(T val)
 {
-  val |= val >> 1  ;
-  val |= val >> 2  ;
-  val |= val >> 4  ;
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
   return val;
 }
 
-template < typename T >
-RAJA_INLINE
-RAJA_HOST_DEVICE
-typename std::enable_if<sizeof(T) == 2, T>::type np2m1(T val)
+template <typename T>
+RAJA_INLINE RAJA_HOST_DEVICE typename std::enable_if<sizeof(T) == 2, T>::type
+np2m1(T val)
 {
-  val |= val >> 1  ;
-  val |= val >> 2  ;
-  val |= val >> 4  ;
-  val |= val >> 8  ;
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
+  val |= val >> 8;
   return val;
 }
 
-template < typename T >
-RAJA_INLINE
-RAJA_HOST_DEVICE
-typename std::enable_if<sizeof(T) == 4, T>::type np2m1(T val)
+template <typename T>
+RAJA_INLINE RAJA_HOST_DEVICE typename std::enable_if<sizeof(T) == 4, T>::type
+np2m1(T val)
 {
-  val |= val >> 1  ;
-  val |= val >> 2  ;
-  val |= val >> 4  ;
-  val |= val >> 8  ;
-  val |= val >> 16 ;
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
+  val |= val >> 8;
+  val |= val >> 16;
   return val;
 }
 
-template < typename T >
-RAJA_INLINE
-RAJA_HOST_DEVICE
-typename std::enable_if<sizeof(T) == 8, T>::type np2m1(T val)
+template <typename T>
+RAJA_INLINE RAJA_HOST_DEVICE typename std::enable_if<sizeof(T) == 8, T>::type
+np2m1(T val)
 {
-  val |= val >> 1  ;
-  val |= val >> 2  ;
-  val |= val >> 4  ;
-  val |= val >> 8  ;
-  val |= val >> 16 ;
-  val |= val >> 32 ;
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
+  val |= val >> 8;
+  val |= val >> 16;
+  val |= val >> 32;
   return val;
 }
 
-template < typename T >
-RAJA_INLINE
-RAJA_HOST_DEVICE
-typename std::enable_if<sizeof(T) == 16, T>::type np2m1(T val)
+template <typename T>
+RAJA_INLINE RAJA_HOST_DEVICE typename std::enable_if<sizeof(T) == 16, T>::type
+np2m1(T val)
 {
-  val |= val >> 1  ;
-  val |= val >> 2  ;
-  val |= val >> 4  ;
-  val |= val >> 8  ;
-  val |= val >> 16 ;
-  val |= val >> 32 ;
-  val |= val >> 64 ;
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
+  val |= val >> 8;
+  val |= val >> 16;
+  val |= val >> 32;
+  val |= val >> 64;
   return val;
 }
 
 // Assist return type conditional overloading of testAtomicRefLogicalOp
-struct int_op {}; // represents underlying op type = integral
-struct all_op {}; // these op types can accept integral or float
+struct int_op
+{};  // represents underlying op type = integral
+struct all_op
+{};  // these op types can accept integral or float
 
 
-#endif // __RAJA_test_atomic_ref_types_HPP__
+#endif  // __RAJA_test_atomic_ref_types_HPP__
diff --git a/test/include/RAJA_test-atomic-types.hpp b/test/include/RAJA_test-atomic-types.hpp
index 90a1be4024..5a9df0ab43 100644
--- a/test/include/RAJA_test-atomic-types.hpp
+++ b/test/include/RAJA_test-atomic-types.hpp
@@ -21,15 +21,14 @@
 //
 // Atomic data types
 //
-using AtomicDataTypeList =
-  camp::list< RAJA::Index_type,
-              int,
+using AtomicDataTypeList = camp::list<RAJA::Index_type,
+                                      int,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              unsigned int,
-              long long,
-              unsigned long long,
-              float,
+                                      unsigned int,
+                                      long long,
+                                      unsigned long long,
+                                      float,
 #endif
-              double >;
+                                      double>;
 
-#endif // __RAJA_test_atomic_types_HPP__
+#endif  // __RAJA_test_atomic_types_HPP__
diff --git a/test/include/RAJA_test-atomicpol.hpp b/test/include/RAJA_test-atomicpol.hpp
index cc327d434d..c13e9a68bd 100644
--- a/test/include/RAJA_test-atomicpol.hpp
+++ b/test/include/RAJA_test-atomicpol.hpp
@@ -11,93 +11,83 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-using SequentialAtomicPols =
-  camp::list<
+using SequentialAtomicPols = camp::list<
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              RAJA::auto_atomic,
-              RAJA::builtin_atomic,
+    RAJA::auto_atomic,
+    RAJA::builtin_atomic,
 #endif
 #if defined(RAJA_ENABLE_CUDA)
-              RAJA::cuda_atomic_explicit<RAJA::seq_atomic>,
+    RAJA::cuda_atomic_explicit<RAJA::seq_atomic>,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              RAJA::cuda_atomic_explicit<RAJA::builtin_atomic>,
+    RAJA::cuda_atomic_explicit<RAJA::builtin_atomic>,
 #endif
 #endif
 #if defined(RAJA_ENABLE_HIP)
-              RAJA::hip_atomic_explicit<RAJA::seq_atomic>,
+    RAJA::hip_atomic_explicit<RAJA::seq_atomic>,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              RAJA::hip_atomic_explicit<RAJA::builtin_atomic>,
+    RAJA::hip_atomic_explicit<RAJA::builtin_atomic>,
 #endif
 #endif
-              RAJA::seq_atomic
-            >;
+    RAJA::seq_atomic>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPAtomicPols =
-  camp::list<
+using OpenMPAtomicPols = camp::list<
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              RAJA::omp_atomic,
-              RAJA::builtin_atomic,
+    RAJA::omp_atomic,
+    RAJA::builtin_atomic,
 #endif
 #if defined(RAJA_ENABLE_CUDA)
-              RAJA::cuda_atomic_explicit<RAJA::omp_atomic>,
+    RAJA::cuda_atomic_explicit<RAJA::omp_atomic>,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              RAJA::cuda_atomic_explicit<RAJA::builtin_atomic>,
+    RAJA::cuda_atomic_explicit<RAJA::builtin_atomic>,
 #endif
 #endif
 #if defined(RAJA_ENABLE_HIP)
-              RAJA::hip_atomic_explicit<RAJA::omp_atomic>,
+    RAJA::hip_atomic_explicit<RAJA::omp_atomic>,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              RAJA::hip_atomic_explicit<RAJA::builtin_atomic>,
+    RAJA::hip_atomic_explicit<RAJA::builtin_atomic>,
 #endif
 #endif
-              RAJA::auto_atomic
-            >;
+    RAJA::auto_atomic>;
 #endif  // RAJA_ENABLE_OPENMP
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaAtomicPols =
-  camp::list<
+using CudaAtomicPols = camp::list<
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              RAJA::auto_atomic,
-              RAJA::cuda_atomic_explicit<RAJA::seq_atomic>,
-              RAJA::cuda_atomic_explicit<RAJA::builtin_atomic>,
+    RAJA::auto_atomic,
+    RAJA::cuda_atomic_explicit<RAJA::seq_atomic>,
+    RAJA::cuda_atomic_explicit<RAJA::builtin_atomic>,
 #if defined(RAJA_ENABLE_OPENMP)
-              RAJA::cuda_atomic_explicit<RAJA::omp_atomic>,
+    RAJA::cuda_atomic_explicit<RAJA::omp_atomic>,
 #endif
 #endif
-              RAJA::cuda_atomic
-            >;
+    RAJA::cuda_atomic>;
 #endif  // RAJA_ENABLE_CUDA
 
 #if defined(RAJA_ENABLE_HIP)
-using HipAtomicPols =
-  camp::list<
+using HipAtomicPols = camp::list<
 #if defined(RAJA_TEST_EXHAUSTIVE)
-               RAJA::auto_atomic,
-               RAJA::hip_atomic_explicit<RAJA::seq_atomic>,
-               RAJA::hip_atomic_explicit<RAJA::builtin_atomic>,
+    RAJA::auto_atomic,
+    RAJA::hip_atomic_explicit<RAJA::seq_atomic>,
+    RAJA::hip_atomic_explicit<RAJA::builtin_atomic>,
 #if defined(RAJA_ENABLE_OPENMP)
-               RAJA::hip_atomic_explicit<RAJA::omp_atomic>,
+    RAJA::hip_atomic_explicit<RAJA::omp_atomic>,
 #endif
 #endif
-               RAJA::hip_atomic
-            >;
+    RAJA::hip_atomic>;
 #endif  // RAJA_ENABLE_HIP
 
 #if defined(RAJA_ENABLE_SYCL)
-using SyclAtomicPols =
-  camp::list<
+using SyclAtomicPols = camp::list<
 #if defined(RAJA_TEST_EXHAUSTIVE)
-               RAJA::auto_atomic,
-               RAJA::sycl_atomic_explicit<RAJA::seq_atomic>,
-               RAJA::sycl_atomic_explicit<RAJA::builtin_atomic>,
+    RAJA::auto_atomic,
+    RAJA::sycl_atomic_explicit<RAJA::seq_atomic>,
+    RAJA::sycl_atomic_explicit<RAJA::builtin_atomic>,
 #if defined(RAJA_ENABLE_OPENMP)
-               RAJA::sycl_atomic_explicit<RAJA::omp_atomic>,
+    RAJA::sycl_atomic_explicit<RAJA::omp_atomic>,
 #endif
 #endif
-               RAJA::sycl_atomic
-            >;
+    RAJA::sycl_atomic>;
 #endif  // RAJA_ENABLE_SYCL
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
diff --git a/test/include/RAJA_test-base.hpp b/test/include/RAJA_test-base.hpp
index 98bf53e1c2..470ddb9cfd 100644
--- a/test/include/RAJA_test-base.hpp
+++ b/test/include/RAJA_test-base.hpp
@@ -23,9 +23,10 @@ template <class T>
 struct Test;
 
 template <class... T>
-struct Test<camp::list<T...>> {
+struct Test<camp::list<T...>>
+{
   using Types = ::testing::Types<T...>;
 };
 
 
-#endif // __RAJA_test_base_HPP__
+#endif  // __RAJA_test_base_HPP__
diff --git a/test/include/RAJA_test-camp.hpp b/test/include/RAJA_test-camp.hpp
index 45e125d92a..a9959f3c73 100644
--- a/test/include/RAJA_test-camp.hpp
+++ b/test/include/RAJA_test-camp.hpp
@@ -42,4 +42,4 @@ using HipResourceList = camp::list<camp::resources::Hip>;
 using SyclResourceList = camp::list<camp::resources::Sycl>;
 #endif
 
-#endif // __RAJA_test_camp_HPP__
+#endif  // __RAJA_test_camp_HPP__
diff --git a/test/include/RAJA_test-dynamic-forall.hpp b/test/include/RAJA_test-dynamic-forall.hpp
index 0185061a6d..9988492216 100644
--- a/test/include/RAJA_test-dynamic-forall.hpp
+++ b/test/include/RAJA_test-dynamic-forall.hpp
@@ -15,18 +15,21 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-using policy_list = camp::list<camp::list<RAJA::seq_exec
-                               ,RAJA::simd_exec
+using policy_list = camp::list<camp::list<RAJA::seq_exec,
+                                          RAJA::simd_exec
 #if defined(RAJA_ENABLE_OPENMP)
-                               ,RAJA::omp_parallel_for_exec
+                                          ,
+                                          RAJA::omp_parallel_for_exec
 #endif
 #if defined(RAJA_ENABLE_CUDA)
-                               ,RAJA::cuda_exec<256>
-                               ,RAJA::cuda_exec<512>
+                                          ,
+                                          RAJA::cuda_exec<256>,
+                                          RAJA::cuda_exec<512>
 #endif
 #if defined(RAJA_ENABLE_HIP)
-                               ,RAJA::hip_exec<256>
-                               ,RAJA::hip_exec<512>
+                                          ,
+                                          RAJA::hip_exec<256>,
+                                          RAJA::hip_exec<512>
 #endif
                                           >>;
 
diff --git a/test/include/RAJA_test-forall-async-execpol.hpp b/test/include/RAJA_test-forall-async-execpol.hpp
index fa9526476e..587f816476 100644
--- a/test/include/RAJA_test-forall-async-execpol.hpp
+++ b/test/include/RAJA_test-forall-async-execpol.hpp
@@ -18,29 +18,30 @@
 #include "RAJA_test-forall-execpol.hpp"
 
 // Sequential execution policy types
-using SequentialAsyncForallExecPols = SequentialForallExecPols;
+using SequentialAsyncForallExecPols       = SequentialForallExecPols;
 using SequentialAsyncForallReduceExecPols = SequentialForallReduceExecPols;
 using SequentialAsyncForallAtomicExecPols = SequentialForallAtomicExecPols;
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-using OpenMPAsyncForallExecPols = OpenMPForallExecPols;
+using OpenMPAsyncForallExecPols       = OpenMPForallExecPols;
 using OpenMPAsyncForallReduceExecPols = OpenMPForallReduceExecPols;
 using OpenMPAsyncForallAtomicExecPols = OpenMPForallAtomicExecPols;
 
 #endif  // RAJA_ENABLE_OPENMP
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetAsyncForallExecPols = OpenMPTargetForallExecPols;
+using OpenMPTargetAsyncForallExecPols       = OpenMPTargetForallExecPols;
 using OpenMPTargetAsyncForallReduceExecPols = OpenMPTargetForallReduceExecPols;
 using OpenMPTargetAsyncForallAtomicExecPols = OpenMPTargetForallAtomicExecPols;
 
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaAsyncForallExecPols = camp::list< RAJA::cuda_exec<128, true>,
-                                       RAJA::cuda_exec<256, true>,
-                                       RAJA::cuda_exec_explicit<256,2, true> >;
+using CudaAsyncForallExecPols =
+    camp::list<RAJA::cuda_exec<128, true>,
+               RAJA::cuda_exec<256, true>,
+               RAJA::cuda_exec_explicit<256, 2, true>>;
 
 using CudaAsyncForallReduceExecPols = CudaForallExecPols;
 
@@ -49,8 +50,8 @@ using CudaAsyncForallAtomicExecPols = CudaForallExecPols;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipAsyncForallExecPols = camp::list< RAJA::hip_exec<128, true>,
-                                      RAJA::hip_exec<256, true>  >;
+using HipAsyncForallExecPols =
+    camp::list<RAJA::hip_exec<128, true>, RAJA::hip_exec<256, true>>;
 
 using HipAsyncForallReduceExecPols = HipForallExecPols;
 
diff --git a/test/include/RAJA_test-forall-data.hpp b/test/include/RAJA_test-forall-data.hpp
index 3ced1c4cf1..d932e6d94f 100644
--- a/test/include/RAJA_test-forall-data.hpp
+++ b/test/include/RAJA_test-forall-data.hpp
@@ -6,7 +6,7 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 //
-// Utility routines for allocating/deallocating arrays in for forall tests. 
+// Utility routines for allocating/deallocating arrays for forall tests.
 //
 
 #ifndef __RAJA_test_forall_data_HPP__
@@ -14,45 +14,47 @@
 
 #include "camp/resource.hpp"
 
-template<typename T>
+template <typename T>
 void allocateForallTestData(size_t N,
                             camp::resources::Resource work_res,
                             T** work_array,
                             T** check_array,
                             T** test_array)
 {
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
   *work_array = work_res.allocate<T>(RAJA::stripIndexType(N));
 
   *check_array = host_res.allocate<T>(RAJA::stripIndexType(N));
-  *test_array = host_res.allocate<T>(RAJA::stripIndexType(N));
+  *test_array  = host_res.allocate<T>(RAJA::stripIndexType(N));
 }
 
 // for RAJA strongly typed indices
-template<typename T,
-         typename std::enable_if<std::is_base_of<RAJA::IndexValueBase, camp::type::ptr::rem<T>>::value>::type* = nullptr>
+template <typename T,
+          typename std::enable_if<
+              std::is_base_of<RAJA::IndexValueBase,
+                              camp::type::ptr::rem<T>>::value>::type* = nullptr>
 void allocateForallTestData(T N,
                             camp::resources::Resource work_res,
                             T** work_array,
                             T** check_array,
                             T** test_array)
 {
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
   *work_array = work_res.allocate<T>(RAJA::stripIndexType(N));
 
   *check_array = host_res.allocate<T>(RAJA::stripIndexType(N));
-  *test_array = host_res.allocate<T>(RAJA::stripIndexType(N));
+  *test_array  = host_res.allocate<T>(RAJA::stripIndexType(N));
 }
 
-template<typename T>
+template <typename T>
 void deallocateForallTestData(camp::resources::Resource work_res,
                               T* work_array,
                               T* check_array,
                               T* test_array)
 {
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
   work_res.deallocate(work_array);
 
@@ -60,4 +62,4 @@ void deallocateForallTestData(camp::resources::Resource work_res,
   host_res.deallocate(test_array);
 }
 
-#endif // __RAJA_test_forall_data_HPP__
+#endif  // __RAJA_test_forall_data_HPP__
diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp
index 40adaccc8c..cc8f9b2a26 100644
--- a/test/include/RAJA_test-forall-execpol.hpp
+++ b/test/include/RAJA_test-forall-execpol.hpp
@@ -16,87 +16,109 @@
 #include "camp/list.hpp"
 
 // Sequential execution policy types
-using SequentialForallExecPols = camp::list< RAJA::seq_exec,
-                                             RAJA::simd_exec >;
+using SequentialForallExecPols = camp::list<RAJA::seq_exec, RAJA::simd_exec>;
 
 //
 // Sequential execution policy types for reduction and atomic tests.
 //
 // Note: RAJA::simd_exec does not work with these.
 //
-using SequentialForallReduceExecPols = camp::list< RAJA::seq_exec >;
+using SequentialForallReduceExecPols = camp::list<RAJA::seq_exec>;
 
-using SequentialForallAtomicExecPols = camp::list< RAJA::seq_exec >;
+using SequentialForallAtomicExecPols = camp::list<RAJA::seq_exec>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPForallExecPols = 
-  camp::list< RAJA::omp_parallel_for_exec
- 
-              , RAJA::omp_parallel_for_static_exec< >
-              , RAJA::omp_parallel_for_static_exec<4>
+using OpenMPForallExecPols = camp::list<
+    RAJA::omp_parallel_for_exec
 
-#if defined(RAJA_TEST_EXHAUSTIVE)
-              , RAJA::omp_parallel_for_dynamic_exec< >
-              , RAJA::omp_parallel_for_dynamic_exec<4>
-
-              , RAJA::omp_parallel_for_guided_exec< >
-              , RAJA::omp_parallel_for_guided_exec<4>
-
-              , RAJA::omp_parallel_for_runtime_exec
-
-              , RAJA::omp_parallel_exec<RAJA::omp_for_exec>
-
-              , RAJA::omp_parallel_exec<RAJA::omp_for_static_exec< >>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_schedule_exec<RAJA::policy::omp::Static< >>>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_static_exec<8>>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_schedule_exec<RAJA::policy::omp::Static<8>>>
-
-              , RAJA::omp_parallel_exec<RAJA::omp_for_nowait_schedule_exec<RAJA::policy::omp::Static< >>>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_nowait_static_exec<4>>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_nowait_schedule_exec<RAJA::policy::omp::Static<4>>>
-
-              , RAJA::omp_parallel_exec<RAJA::omp_for_dynamic_exec< >>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_schedule_exec<RAJA::policy::omp::Dynamic< >>>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_dynamic_exec<8>>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_schedule_exec<RAJA::policy::omp::Dynamic<8>>>
+    ,
+    RAJA::omp_parallel_for_static_exec<>,
+    RAJA::omp_parallel_for_static_exec<4>
 
-              , RAJA::omp_parallel_exec<RAJA::omp_for_guided_exec< >>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_schedule_exec<RAJA::policy::omp::Guided< >>>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_guided_exec<8>>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_schedule_exec<RAJA::policy::omp::Guided<8>>>
-
-              , RAJA::omp_parallel_exec<RAJA::omp_for_runtime_exec>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_schedule_exec<RAJA::policy::omp::Runtime>>
-#endif       
-             >;
+#if defined(RAJA_TEST_EXHAUSTIVE)
+    ,
+    RAJA::omp_parallel_for_dynamic_exec<>,
+    RAJA::omp_parallel_for_dynamic_exec<4>
+
+    ,
+    RAJA::omp_parallel_for_guided_exec<>,
+    RAJA::omp_parallel_for_guided_exec<4>
+
+    ,
+    RAJA::omp_parallel_for_runtime_exec
+
+    ,
+    RAJA::omp_parallel_exec<RAJA::omp_for_exec>
+
+    ,
+    RAJA::omp_parallel_exec<RAJA::omp_for_static_exec<>>,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_schedule_exec<RAJA::policy::omp::Static<>>>,
+    RAJA::omp_parallel_exec<RAJA::omp_for_static_exec<8>>,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_schedule_exec<RAJA::policy::omp::Static<8>>>
+
+    ,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_nowait_schedule_exec<RAJA::policy::omp::Static<>>>,
+    RAJA::omp_parallel_exec<RAJA::omp_for_nowait_static_exec<4>>,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_nowait_schedule_exec<RAJA::policy::omp::Static<4>>>
+
+    ,
+    RAJA::omp_parallel_exec<RAJA::omp_for_dynamic_exec<>>,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_schedule_exec<RAJA::policy::omp::Dynamic<>>>,
+    RAJA::omp_parallel_exec<RAJA::omp_for_dynamic_exec<8>>,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_schedule_exec<RAJA::policy::omp::Dynamic<8>>>
+
+    ,
+    RAJA::omp_parallel_exec<RAJA::omp_for_guided_exec<>>,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_schedule_exec<RAJA::policy::omp::Guided<>>>,
+    RAJA::omp_parallel_exec<RAJA::omp_for_guided_exec<8>>,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_schedule_exec<RAJA::policy::omp::Guided<8>>>
+
+    ,
+    RAJA::omp_parallel_exec<RAJA::omp_for_runtime_exec>,
+    RAJA::omp_parallel_exec<
+        RAJA::omp_for_schedule_exec<RAJA::policy::omp::Runtime>>
+#endif
+    >;
 
 using OpenMPForallReduceExecPols = OpenMPForallExecPols;
 
 using OpenMPForallAtomicExecPols =
-  camp::list< RAJA::omp_parallel_for_exec
+    camp::list<RAJA::omp_parallel_for_exec
 
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              , RAJA::omp_parallel_for_static_exec< >
-              , RAJA::omp_parallel_for_static_exec<4>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_nowait_static_exec< >>
-              , RAJA::omp_parallel_exec<RAJA::omp_for_nowait_static_exec<4>>
-
-              , RAJA::omp_parallel_for_dynamic_exec< >
-              , RAJA::omp_parallel_for_dynamic_exec<2>
-
-              , RAJA::omp_parallel_for_guided_exec< >
-              , RAJA::omp_parallel_for_guided_exec<3>
-
-              , RAJA::omp_parallel_for_runtime_exec
+               ,
+               RAJA::omp_parallel_for_static_exec<>,
+               RAJA::omp_parallel_for_static_exec<4>,
+               RAJA::omp_parallel_exec<RAJA::omp_for_nowait_static_exec<>>,
+               RAJA::omp_parallel_exec<RAJA::omp_for_nowait_static_exec<4>>
+
+               ,
+               RAJA::omp_parallel_for_dynamic_exec<>,
+               RAJA::omp_parallel_for_dynamic_exec<2>
+
+               ,
+               RAJA::omp_parallel_for_guided_exec<>,
+               RAJA::omp_parallel_for_guided_exec<3>
+
+               ,
+               RAJA::omp_parallel_for_runtime_exec
 #endif
-            >; 
+               >;
 
 #endif  // RAJA_ENABLE_OPENMP
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 using OpenMPTargetForallExecPols =
-  camp::list< RAJA::omp_target_parallel_for_exec<8>,
-              RAJA::omp_target_parallel_for_exec_nt >;
+    camp::list<RAJA::omp_target_parallel_for_exec<8>,
+               RAJA::omp_target_parallel_for_exec_nt>;
 
 using OpenMPTargetForallReduceExecPols = OpenMPTargetForallExecPols;
 
@@ -105,12 +127,15 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>,
-                                       RAJA::cuda_exec_occ_calc<256>,
-                                       RAJA::cuda_exec_grid<256, 64>,
-                                       RAJA::cuda_exec_explicit<256,2>,
-                                       RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction<size_t,1,2>>,
-                                       RAJA::cuda_exec_occ_custom<256, RAJA::CudaAvoidDeviceMaxThreadOccupancyConcretizer> >;
+using CudaForallExecPols =
+    camp::list<RAJA::cuda_exec<128>,
+               RAJA::cuda_exec_occ_calc<256>,
+               RAJA::cuda_exec_grid<256, 64>,
+               RAJA::cuda_exec_explicit<256, 2>,
+               RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction<size_t, 1, 2>>,
+               RAJA::cuda_exec_occ_custom<
+                   256,
+                   RAJA::CudaAvoidDeviceMaxThreadOccupancyConcretizer>>;
 
 using CudaForallReduceExecPols = CudaForallExecPols;
 
@@ -119,11 +144,14 @@ using CudaForallAtomicExecPols = CudaForallExecPols;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipForallExecPols = camp::list< RAJA::hip_exec<128>,
-                                      RAJA::hip_exec_occ_calc<256>,
-                                      RAJA::hip_exec_grid<256, 64>,
-                                      RAJA::hip_exec_occ_fraction<256, RAJA::Fraction<size_t,1,2>>,
-                                      RAJA::hip_exec_occ_custom<256, RAJA::HipAvoidDeviceMaxThreadOccupancyConcretizer> >;
+using HipForallExecPols =
+    camp::list<RAJA::hip_exec<128>,
+               RAJA::hip_exec_occ_calc<256>,
+               RAJA::hip_exec_grid<256, 64>,
+               RAJA::hip_exec_occ_fraction<256, RAJA::Fraction<size_t, 1, 2>>,
+               RAJA::hip_exec_occ_custom<
+                   256,
+                   RAJA::HipAvoidDeviceMaxThreadOccupancyConcretizer>>;
 
 using HipForallReduceExecPols = HipForallExecPols;
 
@@ -132,8 +160,8 @@ using HipForallAtomicExecPols = HipForallExecPols;
 #endif
 
 #if defined(RAJA_ENABLE_SYCL)
-using SyclForallExecPols = camp::list< RAJA::sycl_exec<128, false>,
-                                       RAJA::sycl_exec<256, false> >;
+using SyclForallExecPols =
+    camp::list<RAJA::sycl_exec<128, false>, RAJA::sycl_exec<256, false>>;
 
 using SyclForallReduceExecPols = SyclForallExecPols;
 
diff --git a/test/include/RAJA_test-forall-indexset-execpol.hpp b/test/include/RAJA_test-forall-indexset-execpol.hpp
index 1a25ba4daf..e4eebcd266 100644
--- a/test/include/RAJA_test-forall-indexset-execpol.hpp
+++ b/test/include/RAJA_test-forall-indexset-execpol.hpp
@@ -13,8 +13,8 @@
 
 // Sequential execution policy types
 using SequentialForallIndexSetExecPols =
-  camp::list< RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec>,
-              RAJA::ExecPolicy<RAJA::seq_segit, RAJA::simd_exec> >;
+    camp::list<RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec>,
+               RAJA::ExecPolicy<RAJA::seq_segit, RAJA::simd_exec>>;
 
 //
 // Sequential execution policy types for reduction tests.
@@ -22,50 +22,48 @@ using SequentialForallIndexSetExecPols =
 // Note: RAJA::simd_exec does not work with these.
 //
 using SequentialForallIndexSetReduceExecPols =
-  camp::list< RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec> >;
+    camp::list<RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec>>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPForallIndexSetExecPols =  
-  camp::list< RAJA::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::seq_exec>,
-              RAJA::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::simd_exec>,
-              RAJA::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec> >;
+using OpenMPForallIndexSetExecPols =
+    camp::list<RAJA::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::seq_exec>,
+               RAJA::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::simd_exec>,
+               RAJA::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>>;
 
 using OpenMPForallIndexSetReduceExecPols =
-  camp::list< RAJA::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::seq_exec>,
-              RAJA::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec> >;
+    camp::list<RAJA::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::seq_exec>,
+               RAJA::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>>;
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetForallIndexSetExecPols =
-  camp::list< RAJA::ExecPolicy<RAJA::seq_segit,
-                               RAJA::omp_target_parallel_for_exec<8>>,
-              RAJA::ExecPolicy<RAJA::seq_segit, 
-                               RAJA::omp_target_parallel_for_exec_nt> >;
+using OpenMPTargetForallIndexSetExecPols = camp::list<
+    RAJA::ExecPolicy<RAJA::seq_segit, RAJA::omp_target_parallel_for_exec<8>>,
+    RAJA::ExecPolicy<RAJA::seq_segit, RAJA::omp_target_parallel_for_exec_nt>>;
 
-using OpenMPTargetForallIndexSetReduceExecPols = 
-      OpenMPTargetForallIndexSetExecPols;
+using OpenMPTargetForallIndexSetReduceExecPols =
+    OpenMPTargetForallIndexSetExecPols;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
 using CudaForallIndexSetExecPols =
-  camp::list< RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<128>>,
-              RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<256>> >;
+    camp::list<RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<128>>,
+               RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<256>>>;
 
 using CudaForallIndexSetReduceExecPols = CudaForallIndexSetExecPols;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 using HipForallIndexSetExecPols =
-  camp::list< RAJA::ExecPolicy<RAJA::seq_segit, RAJA::hip_exec<128>>,
-              RAJA::ExecPolicy<RAJA::seq_segit, RAJA::hip_exec<256>> >;
+    camp::list<RAJA::ExecPolicy<RAJA::seq_segit, RAJA::hip_exec<128>>,
+               RAJA::ExecPolicy<RAJA::seq_segit, RAJA::hip_exec<256>>>;
 
 using HipForallIndexSetReduceExecPols = HipForallIndexSetExecPols;
 #endif
 
 #if defined(RAJA_ENABLE_SYCL)
 using SyclForallIndexSetExecPols =
-  camp::list< RAJA::ExecPolicy<RAJA::seq_segit, RAJA::sycl_exec<128>>,
-              RAJA::ExecPolicy<RAJA::seq_segit, RAJA::sycl_exec<256>> >;
+    camp::list<RAJA::ExecPolicy<RAJA::seq_segit, RAJA::sycl_exec<128>>,
+               RAJA::ExecPolicy<RAJA::seq_segit, RAJA::sycl_exec<256>>>;
 
 using SyclForallIndexSetReduceExecPols = SyclForallIndexSetExecPols;
 #endif
diff --git a/test/include/RAJA_test-index-types.hpp b/test/include/RAJA_test-index-types.hpp
index 231139eb57..ed13851729 100644
--- a/test/include/RAJA_test-index-types.hpp
+++ b/test/include/RAJA_test-index-types.hpp
@@ -25,50 +25,50 @@
 //
 RAJA_INDEX_VALUE(StrongIndexType, "StrongIndexType");
 RAJA_INDEX_VALUE_T(StrongInt, int, "StrongIntType");
-RAJA_INDEX_VALUE_T(StrongULL, unsigned long long , "StrongULLType");
+RAJA_INDEX_VALUE_T(StrongULL, unsigned long long, "StrongULLType");
 
 //
 // Standard index types list
 //
-using IdxTypeList = camp::list<RAJA::Index_type,
-                               int,
+using IdxTypeList =
+    camp::list<RAJA::Index_type,
+               int,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-                               unsigned int,
-// short int types will break a bunch of tests due to assumpitons made in 
-// the test implementations.
-//                             short,
-//                             unsigned short,
-                               long int,
-                               unsigned long,
-                               long long,
+               unsigned int,
+               // short int types will break a bunch of tests due to assumptions
+               // made in the test implementations.
+               //                             short,
+               //                             unsigned short,
+               long int,
+               unsigned long,
+               long long,
 #endif
-                               unsigned long long>;
+               unsigned long long>;
 
 //
 // Signed index types list
 //
-using SignedIdxTypeList = camp::list<RAJA::Index_type,
-                                     int,
-                                     long long>;
+using SignedIdxTypeList = camp::list<RAJA::Index_type, int, long long>;
 
 //
 // Index types w/ Strong types list
 //
-using StrongIdxTypeList = camp::list<RAJA::Index_type,
-                                     int,
-                                     StrongIndexType,
+using StrongIdxTypeList =
+    camp::list<RAJA::Index_type,
+               int,
+               StrongIndexType,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-                                     //StrongInt,
-                                     unsigned int,
-// short int types will break a bunch of tests due to assumpitons made in 
-// the test implementations.
-//                                   short,
-//                                   unsigned short,
-                                     long int,
-                                     unsigned long,
-                                     long long,
+               // StrongInt,
+               unsigned int,
+               // short int types will break a bunch of tests due to assumptions
+               // made in the test implementations.
+               //                                   short,
+               //                                   unsigned short,
+               long int,
+               unsigned long,
+               long long,
 #endif
-                                     //StrongULL,
-                                     unsigned long long>;
+               // StrongULL,
+               unsigned long long>;
 
-#endif // __RAJA_test_index_types_HPP__
+#endif  // __RAJA_test_index_types_HPP__
diff --git a/test/include/RAJA_test-indexset-build.hpp b/test/include/RAJA_test-indexset-build.hpp
index a7bcdf5b05..4bc41ac9cf 100644
--- a/test/include/RAJA_test-indexset-build.hpp
+++ b/test/include/RAJA_test-indexset-build.hpp
@@ -18,17 +18,17 @@
 #include <random>
 
 //
-// Utility routine to construct index set with mix of Range, RangeStride, 
+// Utility routine to construct index set with mix of Range, RangeStride,
 // and List segments to use in various tests.
 //
 template <typename INDEX_TYPE,
           typename RANGE_TYPE,
           typename RANGESTRIDE_TYPE,
           typename LIST_TYPE>
-void buildIndexSet( 
-  RAJA::TypedIndexSet< RANGE_TYPE, RANGESTRIDE_TYPE, LIST_TYPE >& iset, 
-  std::vector<INDEX_TYPE>& indices_out,
-  camp::resources::Resource working_res )
+void buildIndexSet(
+    RAJA::TypedIndexSet<RANGE_TYPE, RANGESTRIDE_TYPE, LIST_TYPE>& iset,
+    std::vector<INDEX_TYPE>& indices_out,
+    camp::resources::Resource working_res)
 {
   //
   //  Build vector of integers for creating List segments.
@@ -38,27 +38,29 @@ void buildIndexSet(
 
   std::vector<INDEX_TYPE> lindices;
   INDEX_TYPE idx = 0;
-  while (lindices.size() < 3000) {
+  while (lindices.size() < 3000)
+  {
     double dval = dist(gen);
-    if (dval > 0.3) {
+    if (dval > 0.3)
+    {
       lindices.push_back(idx);
     }
     idx++;
   }
 
   //
-  // Construct a mix of Range, RangeStride, and List segments 
+  // Construct a mix of Range, RangeStride, and List segments
   // and add them to index set
   //
-  INDEX_TYPE rbeg = 0;
-  INDEX_TYPE rend = 0;
-  INDEX_TYPE stride = 0;
+  INDEX_TYPE rbeg     = 0;
+  INDEX_TYPE rend     = 0;
+  INDEX_TYPE stride   = 0;
   INDEX_TYPE last_idx = 0;
-  INDEX_TYPE lseg_len = static_cast<INDEX_TYPE>( lindices.size() );
+  INDEX_TYPE lseg_len = static_cast<INDEX_TYPE>(lindices.size());
   std::vector<INDEX_TYPE> lseg(lseg_len);
   std::vector<INDEX_TYPE> lseg_vec(lseg_len);
 
-  indices_out.clear(); 
+  indices_out.clear();
 
   // Create empty Range segment
   rbeg = 1;
@@ -70,34 +72,38 @@ void buildIndexSet(
   rbeg = 1;
   rend = 1578;
   iset.push_back(RANGE_TYPE(rbeg, rend));
-  for (INDEX_TYPE i = rbeg; i < rend; ++i) { 
-    indices_out.push_back( i ); 
+  for (INDEX_TYPE i = rbeg; i < rend; ++i)
+  {
+    indices_out.push_back(i);
   }
   last_idx = rend;
 
   // Create List segment
-  for (INDEX_TYPE i = 0; i < lseg_len; ++i) {
+  for (INDEX_TYPE i = 0; i < lseg_len; ++i)
+  {
     lseg[i] = lindices[i] + last_idx + 3;
-    indices_out.push_back( lseg[i] );
+    indices_out.push_back(lseg[i]);
   }
   iset.push_back(LIST_TYPE(&lseg[0], lseg_len, working_res));
   last_idx = lseg[lseg_len - 1];
 
   // Create List segment using alternate ctor
-  for (INDEX_TYPE i = 0; i < lseg_len; ++i) {
+  for (INDEX_TYPE i = 0; i < lseg_len; ++i)
+  {
     lseg_vec[i] = lindices[i] + last_idx + 3;
-    indices_out.push_back( lseg_vec[i] );
+    indices_out.push_back(lseg_vec[i]);
   }
   iset.push_back(LIST_TYPE(lseg_vec, working_res));
   last_idx = lseg_vec[lseg_len - 1];
 
   // Create Range-stride segment
-  rbeg = last_idx + 16;
-  rend = rbeg + 2040;
+  rbeg   = last_idx + 16;
+  rend   = rbeg + 2040;
   stride = 3;
   iset.push_back(RANGESTRIDE_TYPE(rbeg, rend, stride));
-  for (INDEX_TYPE i = rbeg; i < rend; i += stride) { 
-    indices_out.push_back( i ); 
+  for (INDEX_TYPE i = rbeg; i < rend; i += stride)
+  {
+    indices_out.push_back(i);
   }
   last_idx = rend;
 
@@ -105,15 +111,17 @@ void buildIndexSet(
   rbeg = last_idx + 4;
   rend = rbeg + 2759;
   iset.push_back(RANGE_TYPE(rbeg, rend));
-  for (INDEX_TYPE i = rbeg; i < rend; ++i) { 
-    indices_out.push_back( i ); 
+  for (INDEX_TYPE i = rbeg; i < rend; ++i)
+  {
+    indices_out.push_back(i);
   }
   last_idx = rend;
 
   // Create List segment
-  for (INDEX_TYPE i = 0; i < lseg_len; ++i) {
+  for (INDEX_TYPE i = 0; i < lseg_len; ++i)
+  {
     lseg[i] = lindices[i] + last_idx + 5;
-    indices_out.push_back( lseg[i] );
+    indices_out.push_back(lseg[i]);
   }
   iset.push_back(LIST_TYPE(&lseg[0], lseg_len, working_res));
   last_idx = lseg[lseg_len - 1];
@@ -122,15 +130,17 @@ void buildIndexSet(
   rbeg = last_idx + 1;
   rend = rbeg + 320;
   iset.push_back(RANGE_TYPE(rbeg, rend));
-  for (INDEX_TYPE i = rbeg; i < rend; ++i) { 
-    indices_out.push_back( i ); 
+  for (INDEX_TYPE i = rbeg; i < rend; ++i)
+  {
+    indices_out.push_back(i);
   }
   last_idx = rend;
 
   // Create List segment using alternate ctor
-  for (INDEX_TYPE i = 0; i < lseg_len; ++i) {
+  for (INDEX_TYPE i = 0; i < lseg_len; ++i)
+  {
     lseg_vec[i] = lindices[i] + last_idx + 7;
-    indices_out.push_back( lseg_vec[i] );
+    indices_out.push_back(lseg_vec[i]);
   }
   iset.push_back(LIST_TYPE(lseg_vec, working_res));
   last_idx = lseg_vec[lseg_len - 1];
diff --git a/test/include/RAJA_test-kernel-nested-loop-types.hpp b/test/include/RAJA_test-kernel-nested-loop-types.hpp
index 4d13af1e9b..9c323c95e4 100644
--- a/test/include/RAJA_test-kernel-nested-loop-types.hpp
+++ b/test/include/RAJA_test-kernel-nested-loop-types.hpp
@@ -16,30 +16,54 @@
 #define DEVICE_KERNEL CudaKernel
 #endif
 
-struct DEPTH_1_REDUCESUM {};
-struct DEPTH_2 {};
-struct DEPTH_2_COLLAPSE {};
-struct DEPTH_3 {};
-struct DEPTH_3_COLLAPSE {};
-struct DEPTH_3_COLLAPSE_SEQ_INNER {};
-struct DEPTH_3_COLLAPSE_SEQ_OUTER {};
-struct DEPTH_3_REDUCESUM {};
-struct DEPTH_3_REDUCESUM_SEQ_INNER {};
-struct DEPTH_3_REDUCESUM_SEQ_OUTER {};
-struct DEVICE_DEPTH_1_REDUCESUM {};
-struct DEVICE_DEPTH_1_REDUCESUM_WARP {};
-struct DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE {};
-struct DEVICE_DEPTH_1_REDUCESUM_WARPREDUCE {};
-struct DEVICE_DEPTH_2 {};
-struct DEVICE_DEPTH_2_REDUCESUM_WARP {};
-struct DEVICE_DEPTH_2_REDUCESUM_WARPMASK {};
-struct DEVICE_DEPTH_2_REDUCESUM_WARPMASK_FORI {};
-struct DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE {};
-struct DEVICE_DEPTH_3 {};
-struct DEVICE_DEPTH_3_REDUCESUM {};
-struct DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER {};
-struct DEVICE_DEPTH_3_REDUCESUM_SEQ_OUTER {};
-struct DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE {};
+struct DEPTH_1_REDUCESUM
+{};
+struct DEPTH_2
+{};
+struct DEPTH_2_COLLAPSE
+{};
+struct DEPTH_3
+{};
+struct DEPTH_3_COLLAPSE
+{};
+struct DEPTH_3_COLLAPSE_SEQ_INNER
+{};
+struct DEPTH_3_COLLAPSE_SEQ_OUTER
+{};
+struct DEPTH_3_REDUCESUM
+{};
+struct DEPTH_3_REDUCESUM_SEQ_INNER
+{};
+struct DEPTH_3_REDUCESUM_SEQ_OUTER
+{};
+struct DEVICE_DEPTH_1_REDUCESUM
+{};
+struct DEVICE_DEPTH_1_REDUCESUM_WARP
+{};
+struct DEVICE_DEPTH_1_REDUCESUM_WARPDIRECT_TILE
+{};
+struct DEVICE_DEPTH_1_REDUCESUM_WARPREDUCE
+{};
+struct DEVICE_DEPTH_2
+{};
+struct DEVICE_DEPTH_2_REDUCESUM_WARP
+{};
+struct DEVICE_DEPTH_2_REDUCESUM_WARPMASK
+{};
+struct DEVICE_DEPTH_2_REDUCESUM_WARPMASK_FORI
+{};
+struct DEVICE_DEPTH_2_REDUCESUM_WARPREDUCE
+{};
+struct DEVICE_DEPTH_3
+{};
+struct DEVICE_DEPTH_3_REDUCESUM
+{};
+struct DEVICE_DEPTH_3_REDUCESUM_SEQ_INNER
+{};
+struct DEVICE_DEPTH_3_REDUCESUM_SEQ_OUTER
+{};
+struct DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE
+{};
 
 
 //
@@ -47,56 +71,61 @@ struct DEVICE_DEPTH_3_REDUCESUM_WARPREDUCE {};
 // Nested Loop Data Type information
 //
 //
-template<typename LoopPolType, typename... Policies> 
-struct NestedLoopData : camp::list<Policies...> {
+template <typename LoopPolType, typename... Policies>
+struct NestedLoopData : camp::list<Policies...>
+{
   using LoopType = LoopPolType;
 };
 
 
 //
 //
-// Filter out a list of "NestedLoopData" types given a 
+// Filter out a list of "NestedLoopData" types given a
 // tests' supported loop Type list.
 //
 //
-namespace detail{
+namespace detail
+{
 
-  using namespace camp;
+using namespace camp;
 
-  template<typename T, typename Elements>
-  struct is_in_type_list;
+template <typename T, typename Elements>
+struct is_in_type_list;
 
-  template<typename T, typename Elements>
-  struct KELB_impl;
+template <typename T, typename Elements>
+struct KELB_impl;
 
-  template<typename T, typename First, typename... Rest>
-  struct is_in_type_list<T, list<First, Rest...>> :
-    std::conditional<
-      std::is_same<  typename T::LoopType, First  >::value,
-      list<T>,
-      typename is_in_type_list<T, list<Rest...>>::type > {};
+template <typename T, typename First, typename... Rest>
+struct is_in_type_list<T, list<First, Rest...>>
+    : std::conditional<std::is_same<typename T::LoopType, First>::value,
+                       list<T>,
+                       typename is_in_type_list<T, list<Rest...>>::type>
+{};
 
-  template<typename T, typename Last>
-  struct is_in_type_list<T, list<Last>> :
-    std::conditional<
-      std::is_same< typename T::LoopType , Last>::value,
-      list<T>,
-      list<> > {};
+template <typename T, typename Last>
+struct is_in_type_list<T, list<Last>>
+    : std::conditional<std::is_same<typename T::LoopType, Last>::value,
+                       list<T>,
+                       list<>>
+{};
 
-  template<typename POL_TYPE_LIST, typename First, typename... Rest>
-  struct KELB_impl<POL_TYPE_LIST, list<First, Rest...>> :
-    join< typename KELB_impl<POL_TYPE_LIST, list<First  >>::type, 
-          typename KELB_impl<POL_TYPE_LIST, list<Rest...>>::type > {};
+template <typename POL_TYPE_LIST, typename First, typename... Rest>
+struct KELB_impl<POL_TYPE_LIST, list<First, Rest...>>
+    : join<typename KELB_impl<POL_TYPE_LIST, list<First>>::type,
+           typename KELB_impl<POL_TYPE_LIST, list<Rest...>>::type>
+{};
 
-  template<typename POL_TYPE_LIST, typename Last>
-  struct KELB_impl<POL_TYPE_LIST, list<Last>> :
-    is_in_type_list<Last, POL_TYPE_LIST > {};
+template <typename POL_TYPE_LIST, typename Last>
+struct KELB_impl<POL_TYPE_LIST, list<Last>>
+    : is_in_type_list<Last, POL_TYPE_LIST>
+{};
 
-} // namespace detail
+}  // namespace detail
 
 
-template<typename POL_TYPE_LIST, typename EXEC_POL_LIST>
-struct KernelExecListBuilder {
+template <typename POL_TYPE_LIST, typename EXEC_POL_LIST>
+struct KernelExecListBuilder
+{
   using type = typename detail::KELB_impl<POL_TYPE_LIST, EXEC_POL_LIST>::type;
 };
 
diff --git a/test/include/RAJA_test-kernel-tile-size.hpp b/test/include/RAJA_test-kernel-tile-size.hpp
index 78fa28172d..9d9bb95556 100644
--- a/test/include/RAJA_test-kernel-tile-size.hpp
+++ b/test/include/RAJA_test-kernel-tile-size.hpp
@@ -15,4 +15,4 @@
 constexpr int tile_dim_x = 16;
 constexpr int tile_dim_y = 16;
 
-#endif // __RAJA_test_kernel_tile_size_HPP__
+#endif  // __RAJA_test_kernel_tile_size_HPP__
diff --git a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp
index 7179e48fdc..0bb84ddd16 100644
--- a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp
+++ b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp
@@ -15,26 +15,17 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-//Launch policies
-using seq_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::seq_launch_t>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>
-            >;
-
-using Sequential_launch_policies =
-  camp::list<
-             seq_policies
-            >;
+// Launch policies
+using seq_policies = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
+
+using Sequential_launch_policies = camp::list<seq_policies>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using omp_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::omp_launch_t>,
-             RAJA::LoopPolicy<RAJA::omp_for_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>
-            >;
+using omp_policies = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>,
+                                RAJA::LoopPolicy<RAJA::omp_for_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
 
 using OpenMP_launch_policies = camp::list<omp_policies>;
 
@@ -43,47 +34,36 @@ using OpenMP_launch_policies = camp::list<omp_policies>;
 #if defined(RAJA_ENABLE_CUDA)
 
 using cuda_direct_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>,
-             RAJA::LoopPolicy<RAJA::cuda_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>
-            >;
-
-using cuda_direct_explicit_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
-             RAJA::LoopPolicy<RAJA::cuda_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>
-           >;
+    camp::list<RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>,
+               RAJA::LoopPolicy<RAJA::cuda_block_x_direct>,
+               RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>>;
+
+using cuda_direct_explicit_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
+    RAJA::LoopPolicy<RAJA::cuda_block_x_direct>,
+    RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>>;
 
 using Cuda_launch_policies =
-  camp::list<
-             cuda_direct_policies,
-             cuda_direct_explicit_policies
-            >;
+    camp::list<cuda_direct_policies, cuda_direct_explicit_policies>;
 #endif  // RAJA_ENABLE_CUDA
 
 #if defined(RAJA_ENABLE_HIP)
 
 using hip_direct_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
-             RAJA::LoopPolicy<RAJA::hip_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::hip_thread_x_direct>
-           >;
+    camp::list<RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::hip_block_x_direct>,
+               RAJA::LoopPolicy<RAJA::hip_thread_x_direct>>;
 
 using Hip_launch_policies = camp::list<hip_direct_policies>;
 
-#endif // RAJA_ENABLE_HIP
+#endif  // RAJA_ENABLE_HIP
 
 #if defined(RAJA_ENABLE_SYCL)
 
 using sycl_direct_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
-             RAJA::LoopPolicy<RAJA::sycl_group_2_direct>,
-             RAJA::LoopPolicy<RAJA::sycl_local_2_direct>
-            >;
+    camp::list<RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::sycl_group_2_direct>,
+               RAJA::LoopPolicy<RAJA::sycl_local_2_direct>>;
 
 using Sycl_launch_policies = camp::list<sycl_direct_policies>;
 
diff --git a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp
index f84823e414..258809a569 100644
--- a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp
+++ b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp
@@ -15,32 +15,26 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-//Launch policies
-using seq_policies = 
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::seq_launch_t>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>
-            >;
+// Launch policies
+using seq_policies = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
 
 using Sequential_launch_policies = camp::list<seq_policies>;
-                                              
+
 #if defined(RAJA_ENABLE_OPENMP)
 
-using omp_policies = 
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::omp_launch_t>,
-             RAJA::LoopPolicy<RAJA::omp_for_exec>,  
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>,
-             RAJA::LoopPolicy<RAJA::seq_exec>
-            >;
+using omp_policies = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>,
+                                RAJA::LoopPolicy<RAJA::omp_for_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
 
 using OpenMP_launch_policies = camp::list<omp_policies>;
 
@@ -48,68 +42,57 @@ using OpenMP_launch_policies = camp::list<omp_policies>;
 
 #if defined(RAJA_ENABLE_CUDA)
 
-using cuda_direct_policies = 
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>,
-             RAJA::LoopPolicy<RAJA::cuda_block_z_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_block_y_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>
-            >;
-
-using cuda_direct_explicit_policies = 
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
-             RAJA::LoopPolicy<RAJA::cuda_block_z_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_block_y_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>,
-             RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>
-            >;
-
-using Cuda_launch_policies = 
-  camp::list<
-             cuda_direct_policies,
-             cuda_direct_explicit_policies
-             >;
+using cuda_direct_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>,
+               RAJA::LoopPolicy<RAJA::cuda_block_z_direct>,
+               RAJA::LoopPolicy<RAJA::cuda_block_y_direct>,
+               RAJA::LoopPolicy<RAJA::cuda_block_x_direct>,
+               RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>,
+               RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>,
+               RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>>;
+
+using cuda_direct_explicit_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
+    RAJA::LoopPolicy<RAJA::cuda_block_z_direct>,
+    RAJA::LoopPolicy<RAJA::cuda_block_y_direct>,
+    RAJA::LoopPolicy<RAJA::cuda_block_x_direct>,
+    RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>,
+    RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>,
+    RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>>;
+
+using Cuda_launch_policies =
+    camp::list<cuda_direct_policies, cuda_direct_explicit_policies>;
 
 #endif  // RAJA_ENABLE_CUDA
 
 #if defined(RAJA_ENABLE_HIP)
 
-using hip_direct_policies = 
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
-             RAJA::LoopPolicy<RAJA::hip_block_z_direct>,
-             RAJA::LoopPolicy<RAJA::hip_block_y_direct>,
-             RAJA::LoopPolicy<RAJA::hip_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::hip_thread_z_direct>,
-             RAJA::LoopPolicy<RAJA::hip_thread_y_direct>,
-             RAJA::LoopPolicy<RAJA::hip_thread_x_direct>
-           >;
+using hip_direct_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::hip_block_z_direct>,
+               RAJA::LoopPolicy<RAJA::hip_block_y_direct>,
+               RAJA::LoopPolicy<RAJA::hip_block_x_direct>,
+               RAJA::LoopPolicy<RAJA::hip_thread_z_direct>,
+               RAJA::LoopPolicy<RAJA::hip_thread_y_direct>,
+               RAJA::LoopPolicy<RAJA::hip_thread_x_direct>>;
 
 using Hip_launch_policies = camp::list<hip_direct_policies>;
 
-#endif // RAJA_ENABLE_HIP
+#endif  // RAJA_ENABLE_HIP
 
 #if defined(RAJA_ENABLE_SYCL)
 
-using sycl_direct_policies = 
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
-             RAJA::LoopPolicy<RAJA::sycl_group_0_direct>, //slowest
-             RAJA::LoopPolicy<RAJA::sycl_group_1_direct>,
-             RAJA::LoopPolicy<RAJA::sycl_group_2_direct>, //fastest
-             RAJA::LoopPolicy<RAJA::sycl_local_0_direct>,
-             RAJA::LoopPolicy<RAJA::sycl_local_1_direct>,
-             RAJA::LoopPolicy<RAJA::sycl_local_2_direct>
-            >;
+using sycl_direct_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::sycl_group_0_direct>,  // slowest
+               RAJA::LoopPolicy<RAJA::sycl_group_1_direct>,
+               RAJA::LoopPolicy<RAJA::sycl_group_2_direct>,  // fastest
+               RAJA::LoopPolicy<RAJA::sycl_local_0_direct>,
+               RAJA::LoopPolicy<RAJA::sycl_local_1_direct>,
+               RAJA::LoopPolicy<RAJA::sycl_local_2_direct>>;
 
 using Sycl_launch_policies = camp::list<sycl_direct_policies>;
-                                        
+
 #endif
 
 
diff --git a/test/include/RAJA_test-launch-execpol.hpp b/test/include/RAJA_test-launch-execpol.hpp
index fea90a8305..5965621493 100644
--- a/test/include/RAJA_test-launch-execpol.hpp
+++ b/test/include/RAJA_test-launch-execpol.hpp
@@ -15,65 +15,47 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-//Launch policies
-using seq_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::seq_launch_t>,
-  RAJA::LoopPolicy<RAJA::seq_exec>
-  >;
+// Launch policies
+using seq_policies = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
 
-using Sequential_launch_policies = camp::list<
-  seq_policies
-  >;
+using Sequential_launch_policies = camp::list<seq_policies>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using omp_policies = camp::list<
-         RAJA::LaunchPolicy<RAJA::omp_launch_t>,
-         RAJA::LoopPolicy<RAJA::omp_for_exec>
-  >;
+using omp_policies = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>,
+                                RAJA::LoopPolicy<RAJA::omp_for_exec>>;
 
-using OpenMP_launch_policies = camp::list<
-  omp_policies
-  >;
+using OpenMP_launch_policies = camp::list<omp_policies>;
 
 #endif  // RAJA_ENABLE_OPENMP
 
 #if defined(RAJA_ENABLE_CUDA)
 
-using cuda_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>,
-  RAJA::LoopPolicy<RAJA::cuda_global_thread_x>>;
+using cuda_policies = camp::list<RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>,
+                                 RAJA::LoopPolicy<RAJA::cuda_global_thread_x>>;
 
 using cuda_explicit_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
-  RAJA::LoopPolicy<RAJA::cuda_global_thread_x>>;
+    RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
+    RAJA::LoopPolicy<RAJA::cuda_global_thread_x>>;
 
-using Cuda_launch_policies = camp::list<
-        cuda_policies,
-        cuda_explicit_policies
-         >;
+using Cuda_launch_policies = camp::list<cuda_policies, cuda_explicit_policies>;
 #endif  // RAJA_ENABLE_CUDA
 
 #if defined(RAJA_ENABLE_HIP)
 
-using hip_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
-  RAJA::LoopPolicy<RAJA::hip_global_thread_x>>;
+using hip_policies = camp::list<RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
+                                RAJA::LoopPolicy<RAJA::hip_global_thread_x>>;
 
-using Hip_launch_policies = camp::list<
-      hip_policies
-       >;
-#endif // RAJA_ENABLE_HIP
+using Hip_launch_policies = camp::list<hip_policies>;
+#endif  // RAJA_ENABLE_HIP
 
 #if defined(RAJA_ENABLE_SYCL)
 
-using sycl_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
-  RAJA::LoopPolicy<RAJA::sycl_global_item_2>>;
+using sycl_policies = camp::list<RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
+                                 RAJA::LoopPolicy<RAJA::sycl_global_item_2>>;
 
-using Sycl_launch_policies = camp::list<
-      sycl_policies
-       >;
-#endif // RAJA_ENABLE_SYCL
+using Sycl_launch_policies = camp::list<sycl_policies>;
+#endif  // RAJA_ENABLE_SYCL
 
 
 #endif  // __RAJA_TEST_LAUNCH_EXECPOL_HPP__
diff --git a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp
index 6173fc6ffa..bed8b99cd6 100644
--- a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp
+++ b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp
@@ -15,73 +15,55 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-//Launch policies
-using seq_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::seq_launch_t>,
-  RAJA::LoopPolicy<RAJA::seq_exec>,
-  RAJA::LoopPolicy<RAJA::seq_exec>
-  >;
+// Launch policies
+using seq_policies = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
 
-using Sequential_launch_policies = camp::list<
-  seq_policies
-  >;
+using Sequential_launch_policies = camp::list<seq_policies>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using omp_policies = camp::list<
-         RAJA::LaunchPolicy<RAJA::omp_launch_t>,
-         RAJA::LoopPolicy<RAJA::omp_for_exec>,  
-         RAJA::LoopPolicy<RAJA::seq_exec>
-  >;
+using omp_policies = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>,
+                                RAJA::LoopPolicy<RAJA::omp_for_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
 
-using OpenMP_launch_policies = camp::list<
-  omp_policies
-  >;
+using OpenMP_launch_policies = camp::list<omp_policies>;
 
 #endif  // RAJA_ENABLE_OPENMP
 
 #if defined(RAJA_ENABLE_CUDA)
 
-using cuda_loop_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>,
-  RAJA::LoopPolicy<RAJA::cuda_block_x_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>
-  >;
+using cuda_loop_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>,
+               RAJA::LoopPolicy<RAJA::cuda_block_x_loop>,
+               RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>>;
 
 using cuda_loop_explicit_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
-  RAJA::LoopPolicy<RAJA::cuda_block_x_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>
-  >;
-
-using Cuda_launch_policies = camp::list<
-  cuda_loop_policies,
-  cuda_loop_explicit_policies
-  >;
+    RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
+    RAJA::LoopPolicy<RAJA::cuda_block_x_loop>,
+    RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>>;
+
+using Cuda_launch_policies =
+    camp::list<cuda_loop_policies, cuda_loop_explicit_policies>;
 #endif  // RAJA_ENABLE_CUDA
 
 #if defined(RAJA_ENABLE_HIP)
 
-using hip_loop_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
-  RAJA::LoopPolicy<RAJA::hip_block_x_loop>,
-  RAJA::LoopPolicy<RAJA::hip_thread_x_loop>
-  >;
+using hip_loop_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::hip_block_x_loop>,
+               RAJA::LoopPolicy<RAJA::hip_thread_x_loop>>;
 
-using Hip_launch_policies = camp::list<
-      hip_loop_policies
-       >;
-#endif // RAJA_ENABLE_HIP
+using Hip_launch_policies = camp::list<hip_loop_policies>;
+#endif  // RAJA_ENABLE_HIP
 
 #if defined(RAJA_ENABLE_SYCL)
-using sycl_loop_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
-  RAJA::LoopPolicy<RAJA::sycl_group_2_loop>,
-  RAJA::LoopPolicy<RAJA::sycl_local_2_loop>
-  >;
-
-using Sycl_launch_policies = camp::list<  
-  sycl_loop_policies
-  >;
+using sycl_loop_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::sycl_group_2_loop>,
+               RAJA::LoopPolicy<RAJA::sycl_local_2_loop>>;
+
+using Sycl_launch_policies = camp::list<sycl_loop_policies>;
 #endif
 
 
diff --git a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp
index d703216a13..7f4dd17486 100644
--- a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp
+++ b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp
@@ -15,97 +15,79 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-//Launch policies
-using seq_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::seq_launch_t>,
-  RAJA::LoopPolicy<RAJA::seq_exec>,
-  RAJA::LoopPolicy<RAJA::seq_exec>,
-  RAJA::LoopPolicy<RAJA::seq_exec>,
-  RAJA::LoopPolicy<RAJA::seq_exec>,
-  RAJA::LoopPolicy<RAJA::seq_exec>,
-  RAJA::LoopPolicy<RAJA::seq_exec>
-  >;
-
-using Sequential_launch_policies = camp::list<
-  seq_policies
-  >;
+// Launch policies
+using seq_policies = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
+
+using Sequential_launch_policies = camp::list<seq_policies>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using omp_policies = camp::list<
-         RAJA::LaunchPolicy<RAJA::omp_launch_t>,
-         RAJA::LoopPolicy<RAJA::omp_for_exec>,  
-         RAJA::LoopPolicy<RAJA::seq_exec>,
-         RAJA::LoopPolicy<RAJA::seq_exec>,
-         RAJA::LoopPolicy<RAJA::seq_exec>,
-         RAJA::LoopPolicy<RAJA::seq_exec>,
-         RAJA::LoopPolicy<RAJA::seq_exec>
-  >;
-
-using OpenMP_launch_policies = camp::list<
-  omp_policies
-  >;
+using omp_policies = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>,
+                                RAJA::LoopPolicy<RAJA::omp_for_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>,
+                                RAJA::LoopPolicy<RAJA::seq_exec>>;
+
+using OpenMP_launch_policies = camp::list<omp_policies>;
 
 #endif  // RAJA_ENABLE_OPENMP
 
 #if defined(RAJA_ENABLE_CUDA)
 
-using cuda_loop_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>,
-  RAJA::LoopPolicy<RAJA::cuda_block_z_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_block_y_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_block_x_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_thread_z_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_thread_y_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>
-  >;
+using cuda_loop_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>,
+               RAJA::LoopPolicy<RAJA::cuda_block_z_loop>,
+               RAJA::LoopPolicy<RAJA::cuda_block_y_loop>,
+               RAJA::LoopPolicy<RAJA::cuda_block_x_loop>,
+               RAJA::LoopPolicy<RAJA::cuda_thread_z_loop>,
+               RAJA::LoopPolicy<RAJA::cuda_thread_y_loop>,
+               RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>>;
 
 using cuda_loop_explicit_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
-  RAJA::LoopPolicy<RAJA::cuda_block_z_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_block_y_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_block_x_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_thread_z_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_thread_y_loop>,
-  RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>
-  >;
-
-using Cuda_launch_policies = camp::list<
-  cuda_loop_policies,
-  cuda_loop_explicit_policies
-  >;
+    RAJA::LaunchPolicy<RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
+    RAJA::LoopPolicy<RAJA::cuda_block_z_loop>,
+    RAJA::LoopPolicy<RAJA::cuda_block_y_loop>,
+    RAJA::LoopPolicy<RAJA::cuda_block_x_loop>,
+    RAJA::LoopPolicy<RAJA::cuda_thread_z_loop>,
+    RAJA::LoopPolicy<RAJA::cuda_thread_y_loop>,
+    RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>>;
+
+using Cuda_launch_policies =
+    camp::list<cuda_loop_policies, cuda_loop_explicit_policies>;
 #endif  // RAJA_ENABLE_CUDA
 
 #if defined(RAJA_ENABLE_HIP)
 
-using hip_loop_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
-  RAJA::LoopPolicy<RAJA::hip_block_z_loop>,
-  RAJA::LoopPolicy<RAJA::hip_block_y_loop>,
-  RAJA::LoopPolicy<RAJA::hip_block_x_loop>,
-  RAJA::LoopPolicy<RAJA::hip_thread_z_loop>,
-  RAJA::LoopPolicy<RAJA::hip_thread_y_loop>,
-  RAJA::LoopPolicy<RAJA::hip_thread_x_loop>
-  >;
-
-using Hip_launch_policies = camp::list<
-      hip_loop_policies
-       >;
-#endif // RAJA_ENABLE_HIP
+using hip_loop_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::hip_block_z_loop>,
+               RAJA::LoopPolicy<RAJA::hip_block_y_loop>,
+               RAJA::LoopPolicy<RAJA::hip_block_x_loop>,
+               RAJA::LoopPolicy<RAJA::hip_thread_z_loop>,
+               RAJA::LoopPolicy<RAJA::hip_thread_y_loop>,
+               RAJA::LoopPolicy<RAJA::hip_thread_x_loop>>;
+
+using Hip_launch_policies = camp::list<hip_loop_policies>;
+#endif  // RAJA_ENABLE_HIP
 
 #if defined(RAJA_ENABLE_SYCL)
-using sycl_loop_policies = camp::list<
-  RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
-  RAJA::LoopPolicy<RAJA::sycl_group_0_loop>, //slowest index
-  RAJA::LoopPolicy<RAJA::sycl_group_1_loop>,
-  RAJA::LoopPolicy<RAJA::sycl_group_2_loop>, //fastest index
-  RAJA::LoopPolicy<RAJA::sycl_local_0_loop>,
-  RAJA::LoopPolicy<RAJA::sycl_local_1_loop>,
-  RAJA::LoopPolicy<RAJA::sycl_local_2_loop>
-  >;
-
-using Sycl_launch_policies = camp::list<  
-  sycl_loop_policies
-  >;
+using sycl_loop_policies =
+    camp::list<RAJA::LaunchPolicy<RAJA::sycl_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::sycl_group_0_loop>,  // slowest index
+               RAJA::LoopPolicy<RAJA::sycl_group_1_loop>,
+               RAJA::LoopPolicy<RAJA::sycl_group_2_loop>,  // fastest index
+               RAJA::LoopPolicy<RAJA::sycl_local_0_loop>,
+               RAJA::LoopPolicy<RAJA::sycl_local_1_loop>,
+               RAJA::LoopPolicy<RAJA::sycl_local_2_loop>>;
+
+using Sycl_launch_policies = camp::list<sycl_loop_policies>;
 #endif
 
 
diff --git a/test/include/RAJA_test-launch-runtime-execpol.hpp b/test/include/RAJA_test-launch-runtime-execpol.hpp
index fa2b39f761..0d896c7880 100644
--- a/test/include/RAJA_test-launch-runtime-execpol.hpp
+++ b/test/include/RAJA_test-launch-runtime-execpol.hpp
@@ -15,158 +15,129 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-//Launch policies
+// Launch policies
 #if defined(RAJA_ENABLE_CUDA)
-using seq_cuda_policies =
-  camp::list<
-              RAJA::LaunchPolicy<RAJA::seq_launch_t,RAJA::cuda_launch_t<true>>,
-              RAJA::LoopPolicy<RAJA::seq_exec, RAJA::cuda_block_x_direct>,
-              RAJA::LoopPolicy<RAJA::seq_exec,RAJA::cuda_thread_x_loop>
-            >;
-
-using seq_cuda_explicit_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::seq_launch_t,RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
-             RAJA::LoopPolicy<RAJA::seq_exec, RAJA::cuda_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::seq_exec,RAJA::cuda_thread_x_loop>
-            >;
+using seq_cuda_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::cuda_launch_t<true>>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::cuda_block_x_direct>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::cuda_thread_x_loop>>;
+
+using seq_cuda_explicit_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::seq_launch_t,
+                       RAJA::policy::cuda::cuda_launch_explicit_t<true, 0, 0>>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::cuda_block_x_direct>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::cuda_thread_x_loop>>;
 
 using Sequential_launch_policies =
-  camp::list<
-             seq_cuda_policies,
-             seq_cuda_explicit_policies
-            >;
+    camp::list<seq_cuda_policies, seq_cuda_explicit_policies>;
 
 #elif defined(RAJA_ENABLE_HIP)
 using seq_hip_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::seq_launch_t,RAJA::hip_launch_t<true>>,
-             RAJA::LoopPolicy<RAJA::seq_exec, RAJA::hip_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::seq_exec,RAJA::hip_thread_x_loop>
-            >;
+    camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::hip_launch_t<true>>,
+               RAJA::LoopPolicy<RAJA::seq_exec, RAJA::hip_block_x_direct>,
+               RAJA::LoopPolicy<RAJA::seq_exec, RAJA::hip_thread_x_loop>>;
 
 using Sequential_launch_policies = camp::list<seq_hip_policies>;
 
 #elif defined(RAJA_ENABLE_SYCL)
 
-using seq_sycl_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::seq_launch_t,RAJA::sycl_launch_t<true>>,
-             RAJA::LoopPolicy<RAJA::seq_exec, RAJA::sycl_group_2_direct>,
-             RAJA::LoopPolicy<RAJA::seq_exec,RAJA::sycl_local_2_loop>
-            >;
+using seq_sycl_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::sycl_launch_t<true>>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::sycl_group_2_direct>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::sycl_local_2_loop>>;
 
 using Sequential_launch_policies = camp::list<seq_sycl_policies>;
 
 #else
 using Sequential_launch_policies =
-  camp::list<
-    camp::list<
-               RAJA::LaunchPolicy<RAJA::seq_launch_t>,
-               RAJA::LoopPolicy<RAJA::seq_exec>,
-               RAJA::LoopPolicy<RAJA::seq_exec>
-              >
-            >;
-#endif // Sequential
+    camp::list<camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>,
+                          RAJA::LoopPolicy<RAJA::seq_exec>,
+                          RAJA::LoopPolicy<RAJA::seq_exec>>>;
+#endif  // Sequential
 
 
 #if defined(RAJA_ENABLE_OPENMP)
 
 #if defined(RAJA_ENABLE_CUDA)
 
-using omp_cuda_policies =
-  camp::list<
-              RAJA::LaunchPolicy<RAJA::omp_launch_t,RAJA::cuda_launch_t<false>>,
-              RAJA::LoopPolicy<RAJA::omp_for_exec, RAJA::cuda_block_x_direct>,
-              RAJA::LoopPolicy<RAJA::seq_exec,RAJA::cuda_thread_x_loop>
-            >;
+using omp_cuda_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::omp_launch_t, RAJA::cuda_launch_t<false>>,
+    RAJA::LoopPolicy<RAJA::omp_for_exec, RAJA::cuda_block_x_direct>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::cuda_thread_x_loop>>;
 
-using omp_cuda_explicit_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::omp_launch_t,RAJA::policy::cuda::cuda_launch_explicit_t<false, 0, 0>>,
-             RAJA::LoopPolicy<RAJA::omp_for_exec, RAJA::cuda_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::seq_exec,RAJA::cuda_thread_x_loop>
-            >;
+using omp_cuda_explicit_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::omp_launch_t,
+                       RAJA::policy::cuda::cuda_launch_explicit_t<false, 0, 0>>,
+    RAJA::LoopPolicy<RAJA::omp_for_exec, RAJA::cuda_block_x_direct>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::cuda_thread_x_loop>>;
 
 using OpenMP_launch_policies =
-  camp::list<
-             omp_cuda_policies,
-             omp_cuda_explicit_policies
-            >;
+    camp::list<omp_cuda_policies, omp_cuda_explicit_policies>;
 
 #elif defined(RAJA_ENABLE_HIP)
 
-using omp_hip_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::omp_launch_t,RAJA::hip_launch_t<false>>,
-             RAJA::LoopPolicy<RAJA::omp_for_exec, RAJA::hip_block_x_direct>,
-             RAJA::LoopPolicy<RAJA::seq_exec,RAJA::hip_thread_x_loop>
-            >;
+using omp_hip_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::omp_launch_t, RAJA::hip_launch_t<false>>,
+    RAJA::LoopPolicy<RAJA::omp_for_exec, RAJA::hip_block_x_direct>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::hip_thread_x_loop>>;
 
 using OpenMP_launch_policies = camp::list<omp_hip_policies>;
 
 #elif defined(RAJA_ENABLE_SYCL)
 
-using omp_sycl_policies =
-  camp::list<
-             RAJA::LaunchPolicy<RAJA::omp_launch_t,RAJA::sycl_launch_t<false>>,
-             RAJA::LoopPolicy<RAJA::omp_for_exec, RAJA::sycl_group_2_direct>,
-             RAJA::LoopPolicy<RAJA::seq_exec,RAJA::sycl_local_2_loop>
-            >;
+using omp_sycl_policies = camp::list<
+    RAJA::LaunchPolicy<RAJA::omp_launch_t, RAJA::sycl_launch_t<false>>,
+    RAJA::LoopPolicy<RAJA::omp_for_exec, RAJA::sycl_group_2_direct>,
+    RAJA::LoopPolicy<RAJA::seq_exec, RAJA::sycl_local_2_loop>>;
 
 using OpenMP_launch_policies = camp::list<omp_sycl_policies>;
 
 #else
 
 using OpenMP_launch_policies =
-  camp::list<
-    camp::list<
-                RAJA::LaunchPolicy<RAJA::omp_launch_t>,
-                RAJA::LoopPolicy<RAJA::omp_parallel_for_exec>,
-                RAJA::LoopPolicy<RAJA::seq_exec>
-               >
-             >;
+    camp::list<camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>,
+                          RAJA::LoopPolicy<RAJA::omp_parallel_for_exec>,
+                          RAJA::LoopPolicy<RAJA::seq_exec>>>;
 #endif
 
 #endif  // RAJA_ENABLE_OPENMP
 
 #if defined(RAJA_ENABLE_CUDA)
 
-using Cuda_launch_policies =
-  camp::list<
-             seq_cuda_policies
-            ,seq_cuda_explicit_policies
+using Cuda_launch_policies = camp::list<seq_cuda_policies,
+                                        seq_cuda_explicit_policies
 
 #if defined(RAJA_ENABLE_OPENMP)
-            ,omp_cuda_policies
-            ,omp_cuda_explicit_policies
+                                        ,
+                                        omp_cuda_policies,
+                                        omp_cuda_explicit_policies
 #endif
 
-           >;
+                                        >;
 #endif  // RAJA_ENABLE_CUDA
 
 #if defined(RAJA_ENABLE_HIP)
 
-using Hip_launch_policies = camp::list<
-         seq_hip_policies
+using Hip_launch_policies = camp::list<seq_hip_policies
 
 #if defined(RAJA_ENABLE_OPENMP)
-         , omp_hip_policies
+                                       ,
+                                       omp_hip_policies
 #endif
-        >;
+                                       >;
 
-#endif // RAJA_ENABLE_HIP
+#endif  // RAJA_ENABLE_HIP
 
 #if defined(RAJA_ENABLE_SYCL)
 
-using Sycl_launch_policies = camp::list<
-         seq_sycl_policies
+using Sycl_launch_policies = camp::list<seq_sycl_policies
 
 #if defined(RAJA_ENABLE_OPENMP)
-         , omp_sycl_policies
+                                        ,
+                                        omp_sycl_policies
 #endif
-        >;
+                                        >;
 
-#endif // RAJA_ENABLE_SYCL
+#endif  // RAJA_ENABLE_SYCL
 
 #endif  // __RAJA_TEST_LAUNCH_RUNTIME_EXECPOL_HPP__
diff --git a/test/include/RAJA_test-multi-reduce-abstractor.hpp b/test/include/RAJA_test-multi-reduce-abstractor.hpp
index 2c5412893c..1dd618a72e 100644
--- a/test/include/RAJA_test-multi-reduce-abstractor.hpp
+++ b/test/include/RAJA_test-multi-reduce-abstractor.hpp
@@ -18,7 +18,7 @@
 //
 // Get the identity value for the operation used by the given multi reducer
 //
-template < typename MultiReducer >
+template <typename MultiReducer>
 inline auto get_op_identity(MultiReducer const& RAJA_UNUSED_ARG(multi_reduce))
 {
   return MultiReducer::MultiReduceOp::identity();
@@ -27,144 +27,207 @@ inline auto get_op_identity(MultiReducer const& RAJA_UNUSED_ARG(multi_reduce))
 
 struct SumAbstractor
 {
-  template < typename DATA_TYPE >
-  static constexpr bool supports() { return std::is_arithmetic<DATA_TYPE>::value; }
+  template <typename DATA_TYPE>
+  static constexpr bool supports()
+  {
+    return std::is_arithmetic<DATA_TYPE>::value;
+  }
 
-  template < typename Reducer >
+  template <typename Reducer>
   static bool consistent(Reducer const&)
   {
-    return RAJA::policy_has_trait<typename Reducer::policy, RAJA::reduce::ordered>::value ||
+    return RAJA::policy_has_trait<typename Reducer::policy,
+                                  RAJA::reduce::ordered>::value ||
            !std::is_floating_point<typename Reducer::value_type>::value;
   }
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using reducer = RAJA::ReduceSum<policy, DATA_TYPE>;
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using multi_reducer = RAJA::MultiReduceSum<policy, DATA_TYPE>;
 
-  template < typename Lhs, typename Rhs >
-  RAJA_HOST_DEVICE
-  static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs + rhs; }
+  template <typename Lhs, typename Rhs>
+  RAJA_HOST_DEVICE static auto combine(Lhs const& lhs, Rhs const& rhs)
+  {
+    return lhs + rhs;
+  }
 
-  template < typename Reducer, typename Rhs >
-  RAJA_HOST_DEVICE
-  static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward<Reducer>(lhs) += rhs; }
+  template <typename Reducer, typename Rhs>
+  RAJA_HOST_DEVICE static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs)
+  {
+    return std::forward<Reducer>(lhs) += rhs;
+  }
 
-  template < typename Reducer >
-  static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); }
+  template <typename Reducer>
+  static auto identity(Reducer const&)
+  {
+    return Reducer::MultiReduceOp::identity();
+  }
 };
 
 struct MinAbstractor
 {
-  template < typename DATA_TYPE >
-  static constexpr bool supports() { return std::is_arithmetic<DATA_TYPE>::value; }
+  template <typename DATA_TYPE>
+  static constexpr bool supports()
+  {
+    return std::is_arithmetic<DATA_TYPE>::value;
+  }
 
-  template < typename Reducer >
-  static constexpr bool consistent(Reducer const&) { return true; }
+  template <typename Reducer>
+  static constexpr bool consistent(Reducer const&)
+  {
+    return true;
+  }
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using reducer = RAJA::ReduceSum<policy, DATA_TYPE>;
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using multi_reducer = RAJA::MultiReduceMin<policy, DATA_TYPE>;
 
-  template < typename Lhs, typename Rhs >
-  RAJA_HOST_DEVICE
-  static auto combine(Lhs const& lhs, Rhs const& rhs) { return (lhs > rhs) ? rhs : lhs; }
+  template <typename Lhs, typename Rhs>
+  RAJA_HOST_DEVICE static auto combine(Lhs const& lhs, Rhs const& rhs)
+  {
+    return (lhs > rhs) ? rhs : lhs;
+  }
 
-  template < typename Reducer, typename Rhs >
-  RAJA_HOST_DEVICE
-  static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward<Reducer>(lhs).min(rhs); }
+  template <typename Reducer, typename Rhs>
+  RAJA_HOST_DEVICE static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs)
+  {
+    return std::forward<Reducer>(lhs).min(rhs);
+  }
 
-  template < typename Reducer >
-  static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); }
+  template <typename Reducer>
+  static auto identity(Reducer const&)
+  {
+    return Reducer::MultiReduceOp::identity();
+  }
 };
 
 struct MaxAbstractor
 {
-  template < typename DATA_TYPE >
-  static constexpr bool supports() { return std::is_arithmetic<DATA_TYPE>::value; }
+  template <typename DATA_TYPE>
+  static constexpr bool supports()
+  {
+    return std::is_arithmetic<DATA_TYPE>::value;
+  }
 
-  template < typename Reducer >
-  static constexpr bool consistent(Reducer const&) { return true; }
+  template <typename Reducer>
+  static constexpr bool consistent(Reducer const&)
+  {
+    return true;
+  }
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using reducer = RAJA::ReduceSum<policy, DATA_TYPE>;
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using multi_reducer = RAJA::MultiReduceMax<policy, DATA_TYPE>;
 
-  template < typename Lhs, typename Rhs >
-  RAJA_HOST_DEVICE
-  static auto combine(Lhs const& lhs, Rhs const& rhs) { return (lhs < rhs) ? rhs : lhs; }
+  template <typename Lhs, typename Rhs>
+  RAJA_HOST_DEVICE static auto combine(Lhs const& lhs, Rhs const& rhs)
+  {
+    return (lhs < rhs) ? rhs : lhs;
+  }
 
-  template < typename Reducer, typename Rhs >
-  RAJA_HOST_DEVICE
-  static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward<Reducer>(lhs).max(rhs); }
+  template <typename Reducer, typename Rhs>
+  RAJA_HOST_DEVICE static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs)
+  {
+    return std::forward<Reducer>(lhs).max(rhs);
+  }
 
-  template < typename Reducer >
-  static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); }
+  template <typename Reducer>
+  static auto identity(Reducer const&)
+  {
+    return Reducer::MultiReduceOp::identity();
+  }
 };
 
 struct BitAndAbstractor
 {
-  template < typename DATA_TYPE >
-  static constexpr bool supports() { return std::is_integral<DATA_TYPE>::value; }
+  template <typename DATA_TYPE>
+  static constexpr bool supports()
+  {
+    return std::is_integral<DATA_TYPE>::value;
+  }
 
-  template < typename Reducer >
-  static constexpr bool consistent(Reducer const&) { return true; }
+  template <typename Reducer>
+  static constexpr bool consistent(Reducer const&)
+  {
+    return true;
+  }
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using reducer = RAJA::ReduceSum<policy, DATA_TYPE>;
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using multi_reducer = RAJA::MultiReduceBitAnd<policy, DATA_TYPE>;
 
-  template < typename Lhs, typename Rhs >
-  RAJA_HOST_DEVICE
-  static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs & rhs; }
+  template <typename Lhs, typename Rhs>
+  RAJA_HOST_DEVICE static auto combine(Lhs const& lhs, Rhs const& rhs)
+  {
+    return lhs & rhs;
+  }
 
-  template < typename Reducer, typename Rhs >
-  RAJA_HOST_DEVICE
-  static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward<Reducer>(lhs) &= rhs; }
+  template <typename Reducer, typename Rhs>
+  RAJA_HOST_DEVICE static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs)
+  {
+    return std::forward<Reducer>(lhs) &= rhs;
+  }
 
-  template < typename Reducer >
-  static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); }
+  template <typename Reducer>
+  static auto identity(Reducer const&)
+  {
+    return Reducer::MultiReduceOp::identity();
+  }
 };
 
 struct BitOrAbstractor
 {
-  template < typename DATA_TYPE >
-  static constexpr bool supports() { return std::is_integral<DATA_TYPE>::value; }
+  template <typename DATA_TYPE>
+  static constexpr bool supports()
+  {
+    return std::is_integral<DATA_TYPE>::value;
+  }
 
-  template < typename Reducer >
-  static constexpr bool consistent(Reducer const&) { return true; }
+  template <typename Reducer>
+  static constexpr bool consistent(Reducer const&)
+  {
+    return true;
+  }
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using reducer = RAJA::ReduceSum<policy, DATA_TYPE>;
 
-  template < typename policy, typename DATA_TYPE >
+  template <typename policy, typename DATA_TYPE>
   using multi_reducer = RAJA::MultiReduceBitOr<policy, DATA_TYPE>;
 
-  template < typename Lhs, typename Rhs >
-  RAJA_HOST_DEVICE
-  static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs | rhs; }
+  template <typename Lhs, typename Rhs>
+  RAJA_HOST_DEVICE static auto combine(Lhs const& lhs, Rhs const& rhs)
+  {
+    return lhs | rhs;
+  }
 
-  template < typename Reducer, typename Rhs >
-  RAJA_HOST_DEVICE
-  static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward<Reducer>(lhs) |= rhs; }
+  template <typename Reducer, typename Rhs>
+  RAJA_HOST_DEVICE static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs)
+  {
+    return std::forward<Reducer>(lhs) |= rhs;
+  }
 
-  template < typename Reducer >
-  static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); }
+  template <typename Reducer>
+  static auto identity(Reducer const&)
+  {
+    return Reducer::MultiReduceOp::identity();
+  }
 };
 
 
 // Sequential reduction policy types
-using ReduceSumAbstractors = camp::list< SumAbstractor >;
-using ReduceMinAbstractors = camp::list< MinAbstractor >;
-using ReduceMaxAbstractors = camp::list< MaxAbstractor >;
-using ReduceBitAndAbstractors = camp::list< BitAndAbstractor >;
-using ReduceBitOrAbstractors = camp::list< BitOrAbstractor >;
+using ReduceSumAbstractors    = camp::list<SumAbstractor>;
+using ReduceMinAbstractors    = camp::list<MinAbstractor>;
+using ReduceMaxAbstractors    = camp::list<MaxAbstractor>;
+using ReduceBitAndAbstractors = camp::list<BitAndAbstractor>;
+using ReduceBitOrAbstractors  = camp::list<BitOrAbstractor>;
 
 #endif  // __RAJA_test_multi_reduce_abstractor_HPP__
diff --git a/test/include/RAJA_test-multi-reducepol.hpp b/test/include/RAJA_test-multi-reducepol.hpp
index e024ef70aa..3e962c6df2 100644
--- a/test/include/RAJA_test-multi-reducepol.hpp
+++ b/test/include/RAJA_test-multi-reducepol.hpp
@@ -16,28 +16,29 @@
 #include "camp/list.hpp"
 
 // Sequential reduction policy types
-using SequentialMultiReducePols = camp::list< RAJA::seq_multi_reduce >;
+using SequentialMultiReducePols = camp::list<RAJA::seq_multi_reduce>;
 
 #if defined(RAJA_ENABLE_OPENMP)
 using OpenMPMultiReducePols =
-  camp::list< RAJA::omp_multi_reduce,
-              RAJA::omp_multi_reduce_ordered >;
+    camp::list<RAJA::omp_multi_reduce, RAJA::omp_multi_reduce_ordered>;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaMultiReducePols =
-  camp::list< RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init,
-              RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing,
-              RAJA::cuda_multi_reduce_atomic_global_host_init,
-              RAJA::cuda_multi_reduce_atomic_global_no_replication_host_init >;
+using CudaMultiReducePols = camp::list<
+    RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init,
+    RAJA::
+        cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing,
+    RAJA::cuda_multi_reduce_atomic_global_host_init,
+    RAJA::cuda_multi_reduce_atomic_global_no_replication_host_init>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipMultiReducePols =
-  camp::list< RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init,
-              RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing,
-              RAJA::hip_multi_reduce_atomic_global_host_init,
-              RAJA::hip_multi_reduce_atomic_global_no_replication_host_init  >;
+using HipMultiReducePols = camp::list<
+    RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init,
+    RAJA::
+        hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing,
+    RAJA::hip_multi_reduce_atomic_global_host_init,
+    RAJA::hip_multi_reduce_atomic_global_no_replication_host_init>;
 #endif
 
 #endif  // __RAJA_test_multi_reducepol_HPP__
diff --git a/test/include/RAJA_test-platform.hpp b/test/include/RAJA_test-platform.hpp
index 7862461f18..ecdf7e1a56 100644
--- a/test/include/RAJA_test-platform.hpp
+++ b/test/include/RAJA_test-platform.hpp
@@ -16,10 +16,10 @@
 
 #include "camp/list.hpp"
 
-template < RAJA::Platform PLATFORM >
+template <RAJA::Platform PLATFORM>
 struct PlatformHolder
 {
-   static const RAJA::Platform platform = PLATFORM;
+  static const RAJA::Platform platform = PLATFORM;
 };
 
 //
@@ -38,11 +38,12 @@ using CudaPlatformList = camp::list<PlatformHolder<RAJA::Platform::cuda>>;
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetPlatformList = camp::list<PlatformHolder<RAJA::Platform::omp_target>>;
+using OpenMPTargetPlatformList =
+    camp::list<PlatformHolder<RAJA::Platform::omp_target>>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 using HipPlatformList = camp::list<PlatformHolder<RAJA::Platform::hip>>;
 #endif
 
-#endif // __RAJA_test_platform_HPP__
+#endif  // __RAJA_test_platform_HPP__
diff --git a/test/include/RAJA_test-plugin-kernelpol.hpp b/test/include/RAJA_test-plugin-kernelpol.hpp
index 9c3f0e2e52..0ef68cbb7c 100644
--- a/test/include/RAJA_test-plugin-kernelpol.hpp
+++ b/test/include/RAJA_test-plugin-kernelpol.hpp
@@ -18,86 +18,90 @@
 
 // Sequential execution policy types
 using SequentialPluginKernelExecPols = camp::list<
-      RAJA::KernelPolicy<
-        RAJA::statement::For<0, RAJA::seq_exec,
-          RAJA::statement::Lambda<0>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::Tile<0, RAJA::tile_fixed<2>, RAJA::seq_exec,
-          RAJA::statement::For<0, RAJA::seq_exec,
-            RAJA::statement::Lambda<0>>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::For<0, RAJA::simd_exec,
-          RAJA::statement::Lambda<0>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::Tile<0, RAJA::tile_fixed<2>, RAJA::seq_exec,
-          RAJA::statement::For<0, RAJA::simd_exec,
-            RAJA::statement::Lambda<0>>>>
-    >;
+    RAJA::KernelPolicy<
+        RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>,
+    RAJA::KernelPolicy<RAJA::statement::Tile<
+        0,
+        RAJA::tile_fixed<2>,
+        RAJA::seq_exec,
+        RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>,
+    RAJA::KernelPolicy<
+        RAJA::statement::For<0, RAJA::simd_exec, RAJA::statement::Lambda<0>>>,
+    RAJA::KernelPolicy<RAJA::statement::Tile<
+        0,
+        RAJA::tile_fixed<2>,
+        RAJA::seq_exec,
+        RAJA::statement::For<0, RAJA::simd_exec, RAJA::statement::Lambda<0>>>>>;
 
 #if defined(RAJA_ENABLE_OPENMP)
 using OpenMPPluginKernelExecPols = camp::list<
-      RAJA::KernelPolicy<
-        RAJA::statement::For<0, RAJA::omp_parallel_for_exec,
-          RAJA::statement::Lambda<0>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::Tile<0, RAJA::tile_fixed<2>, RAJA::omp_parallel_for_exec,
-          RAJA::statement::For<0, RAJA::seq_exec,
-            RAJA::statement::Lambda<0>>>>
-    >;
+    RAJA::KernelPolicy<RAJA::statement::For<0,
+                                            RAJA::omp_parallel_for_exec,
+                                            RAJA::statement::Lambda<0>>>,
+    RAJA::KernelPolicy<RAJA::statement::Tile<
+        0,
+        RAJA::tile_fixed<2>,
+        RAJA::omp_parallel_for_exec,
+        RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>;
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetPluginKernelExecPols = camp::list<
-      RAJA::KernelPolicy<
-        RAJA::statement::For<0, RAJA::omp_target_parallel_for_exec<64>,
-          RAJA::statement::Lambda<0>>>
-    >;
+using OpenMPTargetPluginKernelExecPols = camp::list<RAJA::KernelPolicy<
+    RAJA::statement::For<0,
+                         RAJA::omp_target_parallel_for_exec<64>,
+                         RAJA::statement::Lambda<0>>>>;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
 using CudaPluginKernelExecPols = camp::list<
-      RAJA::KernelPolicy<
-        RAJA::statement::CudaKernel<
-          RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
-            RAJA::statement::Lambda<0>>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::CudaKernel<
-          RAJA::statement::Tile<0, RAJA::tile_fixed<128>, RAJA::cuda_block_x_direct,
-            RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
-              RAJA::statement::Lambda<0>>>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::CudaKernelFixed<128,
-          RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
-            RAJA::statement::Lambda<0>>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::CudaKernelFixed<128,
-          RAJA::statement::Tile<0, RAJA::tile_fixed<128>, RAJA::cuda_block_x_direct,
-            RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
-              RAJA::statement::Lambda<0>>>>>
-    >;
+    RAJA::KernelPolicy<RAJA::statement::CudaKernel<
+        RAJA::statement::
+            For<0, RAJA::cuda_thread_x_loop, RAJA::statement::Lambda<0>>>>,
+    RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<
+        0,
+        RAJA::tile_fixed<128>,
+        RAJA::cuda_block_x_direct,
+        RAJA::statement::
+            For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>,
+    RAJA::KernelPolicy<RAJA::statement::CudaKernelFixed<
+        128,
+        RAJA::statement::
+            For<0, RAJA::cuda_thread_x_loop, RAJA::statement::Lambda<0>>>>,
+    RAJA::KernelPolicy<RAJA::statement::CudaKernelFixed<
+        128,
+        RAJA::statement::Tile<
+            0,
+            RAJA::tile_fixed<128>,
+            RAJA::cuda_block_x_direct,
+            RAJA::statement::For<0,
+                                 RAJA::cuda_thread_x_direct,
+                                 RAJA::statement::Lambda<0>>>>>>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 using HipPluginKernelExecPols = camp::list<
-      RAJA::KernelPolicy<
-        RAJA::statement::HipKernel<
-          RAJA::statement::For<0, RAJA::hip_thread_x_loop,
-            RAJA::statement::Lambda<0>>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::HipKernel<
-          RAJA::statement::Tile<0, RAJA::tile_fixed<128>, RAJA::hip_block_x_direct,
-            RAJA::statement::For<0, RAJA::hip_thread_x_direct,
-              RAJA::statement::Lambda<0>>>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::HipKernelFixed<128,
-          RAJA::statement::For<0, RAJA::hip_thread_x_loop,
-            RAJA::statement::Lambda<0>>>>,
-      RAJA::KernelPolicy<
-        RAJA::statement::HipKernelFixed<128,
-          RAJA::statement::Tile<0, RAJA::tile_fixed<128>, RAJA::hip_block_x_direct,
-            RAJA::statement::For<0, RAJA::hip_thread_x_direct,
-              RAJA::statement::Lambda<0>>>>>
-    >;
+    RAJA::KernelPolicy<RAJA::statement::HipKernel<
+        RAJA::statement::
+            For<0, RAJA::hip_thread_x_loop, RAJA::statement::Lambda<0>>>>,
+    RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<
+        0,
+        RAJA::tile_fixed<128>,
+        RAJA::hip_block_x_direct,
+        RAJA::statement::
+            For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>,
+    RAJA::KernelPolicy<RAJA::statement::HipKernelFixed<
+        128,
+        RAJA::statement::
+            For<0, RAJA::hip_thread_x_loop, RAJA::statement::Lambda<0>>>>,
+    RAJA::KernelPolicy<RAJA::statement::HipKernelFixed<
+        128,
+        RAJA::statement::Tile<
+            0,
+            RAJA::tile_fixed<128>,
+            RAJA::hip_block_x_direct,
+            RAJA::statement::For<0,
+                                 RAJA::hip_thread_x_direct,
+                                 RAJA::statement::Lambda<0>>>>>>;
 #endif
 
 #endif  // __RAJA_test_plugin_kernelpol_HPP__
diff --git a/test/include/RAJA_test-plugin-launchpol.hpp b/test/include/RAJA_test-plugin-launchpol.hpp
index 2370084633..e086842f5f 100644
--- a/test/include/RAJA_test-plugin-launchpol.hpp
+++ b/test/include/RAJA_test-plugin-launchpol.hpp
@@ -17,18 +17,22 @@
 #include "camp/list.hpp"
 
 // Sequential execution policy types
-using SequentialPluginLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>>;
+using SequentialPluginLaunchExecPols =
+    camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPPluginLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>>;
+using OpenMPPluginLaunchExecPols =
+    camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>>;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaPluginLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>>;
+using CudaPluginLaunchExecPols =
+    camp::list<RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipPluginLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::hip_launch_t<false>>>;
+using HipPluginLaunchExecPols =
+    camp::list<RAJA::LaunchPolicy<RAJA::hip_launch_t<false>>>;
 
 #endif
 
diff --git a/test/include/RAJA_test-plugin-resource-launchpol.hpp b/test/include/RAJA_test-plugin-resource-launchpol.hpp
index 8d08574347..e1a2caf27e 100644
--- a/test/include/RAJA_test-plugin-resource-launchpol.hpp
+++ b/test/include/RAJA_test-plugin-resource-launchpol.hpp
@@ -17,18 +17,22 @@
 #include "camp/list.hpp"
 
 // Sequential execution policy types
-using SequentialPluginResourceLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>>;
+using SequentialPluginResourceLaunchExecPols =
+    camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPPluginResourceLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>>;
+using OpenMPPluginResourceLaunchExecPols =
+    camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>>;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaPluginResourceLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::cuda_launch_t<false>>>;
+using CudaPluginResourceLaunchExecPols = camp::list<
+    RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::cuda_launch_t<false>>>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipPluginResourceLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::hip_launch_t<false>>>;
+using HipPluginResourceLaunchExecPols = camp::list<
+    RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::hip_launch_t<false>>>;
 
 #endif
 
diff --git a/test/include/RAJA_test-reduce-types.hpp b/test/include/RAJA_test-reduce-types.hpp
index 8d8115321f..49d5cadaea 100644
--- a/test/include/RAJA_test-reduce-types.hpp
+++ b/test/include/RAJA_test-reduce-types.hpp
@@ -21,14 +21,13 @@
 //
 // Reduce data types
 //
-using ReduceDataTypeList =
-  camp::list< int,
+using ReduceDataTypeList = camp::list<int,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              unsigned,
-              long long,
-              unsigned long long,
+                                      unsigned,
+                                      long long,
+                                      unsigned long long,
 #endif
-              float,
-              double >;
+                                      float,
+                                      double>;
 
-#endif // __RAJA_test_reduce_types_HPP__
+#endif  // __RAJA_test_reduce_types_HPP__
diff --git a/test/include/RAJA_test-reduceloc-types.hpp b/test/include/RAJA_test-reduceloc-types.hpp
index 336c7dd23e..a3387ee275 100644
--- a/test/include/RAJA_test-reduceloc-types.hpp
+++ b/test/include/RAJA_test-reduceloc-types.hpp
@@ -15,10 +15,13 @@
 #include "RAJA/RAJA.hpp"
 #include "camp/list.hpp"
 
-struct Index2D {
-   RAJA::Index_type idx, idy;
-   constexpr Index2D() : idx(-1), idy(-1) {}
-   constexpr Index2D(RAJA::Index_type idx, RAJA::Index_type idy) : idx(idx), idy(idy) {}
+struct Index2D
+{
+  RAJA::Index_type idx, idy;
+  constexpr Index2D() : idx(-1), idy(-1) {}
+  constexpr Index2D(RAJA::Index_type idx, RAJA::Index_type idy)
+      : idx(idx), idy(idy)
+  {}
 };
 
-#endif // __RAJA_test_reduceloc_types_HPP__
+#endif  // __RAJA_test_reduceloc_types_HPP__
diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp
index e9e075b287..66fc6f9c7a 100644
--- a/test/include/RAJA_test-reducepol.hpp
+++ b/test/include/RAJA_test-reducepol.hpp
@@ -16,43 +16,44 @@
 #include "camp/list.hpp"
 
 // Sequential reduction policy types
-using SequentialReducePols = camp::list< RAJA::seq_reduce >;
+using SequentialReducePols = camp::list<RAJA::seq_reduce>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPReducePols = 
-#if 0 // is ordered reduction broken???
+using OpenMPReducePols =
+#if 0  // is ordered reduction broken???
   camp::list< RAJA::omp_reduce,
               RAJA::omp_reduce_ordered >;
 #else
-  camp::list< RAJA::omp_reduce >;
+    camp::list<RAJA::omp_reduce>;
 #endif
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetReducePols =
-  camp::list< RAJA::omp_target_reduce >;
+using OpenMPTargetReducePols = camp::list<RAJA::omp_target_reduce>;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaReducePols = camp::list< RAJA::cuda_reduce_device_fence,
-                                   RAJA::cuda_reduce_block_fence,
-                                   RAJA::cuda_reduce_atomic_device_init_device_fence,
-                                   RAJA::cuda_reduce_atomic_device_init_block_fence,
-                                   RAJA::cuda_reduce_atomic_host_init_device_fence,
-                                   RAJA::cuda_reduce_atomic_host_init_block_fence >;
+using CudaReducePols =
+    camp::list<RAJA::cuda_reduce_device_fence,
+               RAJA::cuda_reduce_block_fence,
+               RAJA::cuda_reduce_atomic_device_init_device_fence,
+               RAJA::cuda_reduce_atomic_device_init_block_fence,
+               RAJA::cuda_reduce_atomic_host_init_device_fence,
+               RAJA::cuda_reduce_atomic_host_init_block_fence>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipReducePols = camp::list< RAJA::hip_reduce_device_fence,
-                                  RAJA::hip_reduce_block_fence,
-                                  RAJA::hip_reduce_atomic_device_init_device_fence,
-                                  RAJA::hip_reduce_atomic_device_init_block_fence,
-                                  RAJA::hip_reduce_atomic_host_init_device_fence,
-                                  RAJA::hip_reduce_atomic_host_init_block_fence >;
+using HipReducePols =
+    camp::list<RAJA::hip_reduce_device_fence,
+               RAJA::hip_reduce_block_fence,
+               RAJA::hip_reduce_atomic_device_init_device_fence,
+               RAJA::hip_reduce_atomic_device_init_block_fence,
+               RAJA::hip_reduce_atomic_host_init_device_fence,
+               RAJA::hip_reduce_atomic_host_init_block_fence>;
 #endif
 
 #if defined(RAJA_ENABLE_SYCL)
-using SyclReducePols = camp::list< RAJA::sycl_reduce >;
+using SyclReducePols = camp::list<RAJA::sycl_reduce>;
 #endif
 
 #endif  // __RAJA_test_reducepol_HPP__
diff --git a/test/include/RAJA_test-tensor.hpp b/test/include/RAJA_test-tensor.hpp
index cf633098a9..6c70d8583c 100644
--- a/test/include/RAJA_test-tensor.hpp
+++ b/test/include/RAJA_test-tensor.hpp
@@ -13,231 +13,248 @@
 #include "RAJA_gtest.hpp"
 
 
-using TensorElementTypes = ::testing::Types<
-        int,
-        long,
-        float,
-        double
-    >;
-
-template<typename POL>
-struct TensorTestHelper {
-
-    template<typename BODY>
-    static
-    void exec(BODY const &body){
-      body();
-    }
-
-    static constexpr bool is_device = false;
+using TensorElementTypes = ::testing::Types<int, long, float, double>;
+
+template <typename POL>
+struct TensorTestHelper
+{
+
+  template <typename BODY>
+  static void exec(BODY const& body)
+  {
+    body();
+  }
+
+  static constexpr bool is_device = false;
 };
 
 #ifdef RAJA_ENABLE_CUDA
 
 template <typename BODY>
-__global__
-void test_launcher(BODY body_in)
+__global__ void test_launcher(BODY body_in)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
   body();
 }
 
-template<>
+template <>
 struct TensorTestHelper<RAJA::expt::cuda_warp_register>
 {
 
-    RAJA_SUPPRESS_HD_WARN
-    template<typename BODY>
-    static
-    void exec(BODY const &body){
-      cudaDeviceSynchronize();
-
-      test_launcher<<<1,32>>>(body);
+  RAJA_SUPPRESS_HD_WARN
+  template <typename BODY>
+  static void exec(BODY const& body)
+  {
+    cudaDeviceSynchronize();
 
-      cudaDeviceSynchronize();
+    test_launcher<<<1, 32>>>(body);
 
-    }
+    cudaDeviceSynchronize();
+  }
 
-    static constexpr bool is_device = true;
+  static constexpr bool is_device = true;
 };
 #endif
 
 
-
 #ifdef RAJA_ENABLE_HIP
 
 template <typename BODY>
-__global__
-void test_launcher(BODY body_in)
+__global__ void test_launcher(BODY body_in)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
   body();
 }
 
-template<>
+template <>
 struct TensorTestHelper<RAJA::expt::hip_wave_register>
 {
 
-    template<typename BODY>
-    static
-    void exec(BODY const &body){
-      hipDeviceSynchronize();
+  template <typename BODY>
+  static void exec(BODY const& body)
+  {
+    hipDeviceSynchronize();
 
-      RAJA::forall<RAJA::hip_exec<64>>(RAJA::RangeSegment(0,64),
-      [=] RAJA_HOST_DEVICE (int ){
-        body();
-      });
+    RAJA::forall<RAJA::hip_exec<64>>(RAJA::RangeSegment(0, 64),
+                                     [=] RAJA_HOST_DEVICE(int) { body(); });
 
-      hipDeviceSynchronize();
-
-    }
+    hipDeviceSynchronize();
+  }
 
-    static constexpr bool is_device = true;
+  static constexpr bool is_device = true;
 };
 #endif
 
 
-
-template<typename POL, typename BODY>
-void tensor_do(BODY const &body){
+template <typename POL, typename BODY>
+void tensor_do(BODY const& body)
+{
   TensorTestHelper<POL>::exec(body);
 }
 
 
-
 #if defined(RAJA_ENABLE_CUDA)
 
-template<typename POL, typename T>
-T* tensor_malloc(size_t len){
-  if(TensorTestHelper<POL>::is_device){
-    T *ptr;
+template <typename POL, typename T>
+T* tensor_malloc(size_t len)
+{
+  if (TensorTestHelper<POL>::is_device)
+  {
+    T* ptr;
 
-    cudaErrchk(cudaMalloc(&ptr, len*sizeof(T)));
+    cudaErrchk(cudaMalloc(&ptr, len * sizeof(T)));
 
     return ptr;
   }
-  else{
+  else
+  {
     return new T[len];
   }
 }
 
-template<typename POL, typename T>
-void tensor_free(T *ptr){
-  if(TensorTestHelper<POL>::is_device){
+template <typename POL, typename T>
+void tensor_free(T* ptr)
+{
+  if (TensorTestHelper<POL>::is_device)
+  {
     cudaErrchk(cudaFree(ptr));
   }
-  else{
+  else
+  {
     delete[] ptr;
   }
 }
 
-template<typename POL, typename T>
-void tensor_copy_to_device(T *d_ptr, std::vector<T> const &h_vec){
-  if(TensorTestHelper<POL>::is_device){
-    cudaErrchk(cudaMemcpy(d_ptr, h_vec.data(), h_vec.size()*sizeof(T), cudaMemcpyHostToDevice));
+template <typename POL, typename T>
+void tensor_copy_to_device(T* d_ptr, std::vector<T> const& h_vec)
+{
+  if (TensorTestHelper<POL>::is_device)
+  {
+    cudaErrchk(cudaMemcpy(d_ptr, h_vec.data(), h_vec.size() * sizeof(T),
+                          cudaMemcpyHostToDevice));
   }
-  else{
-    memcpy(d_ptr, h_vec.data(), h_vec.size()*sizeof(T));
+  else
+  {
+    memcpy(d_ptr, h_vec.data(), h_vec.size() * sizeof(T));
   }
 }
 
-template<typename POL, typename T>
-void tensor_copy_to_host(std::vector<T> &h_vec, T const *d_ptr){
-  if(TensorTestHelper<POL>::is_device){
-    cudaErrchk(cudaMemcpy(h_vec.data(), d_ptr, h_vec.size()*sizeof(T), cudaMemcpyDeviceToHost));
+template <typename POL, typename T>
+void tensor_copy_to_host(std::vector<T>& h_vec, T const* d_ptr)
+{
+  if (TensorTestHelper<POL>::is_device)
+  {
+    cudaErrchk(cudaMemcpy(h_vec.data(), d_ptr, h_vec.size() * sizeof(T),
+                          cudaMemcpyDeviceToHost));
   }
-  else{
-    memcpy(h_vec.data(), d_ptr, h_vec.size()*sizeof(T));
+  else
+  {
+    memcpy(h_vec.data(), d_ptr, h_vec.size() * sizeof(T));
   }
 }
 
 
-
 #elif defined(RAJA_ENABLE_HIP)
 
 
-template<typename POL, typename T>
-T* tensor_malloc(size_t len){
-  if(TensorTestHelper<POL>::is_device){
-    T *ptr;
+template <typename POL, typename T>
+T* tensor_malloc(size_t len)
+{
+  if (TensorTestHelper<POL>::is_device)
+  {
+    T* ptr;
 
-    hipErrchk(hipMalloc(&ptr, len*sizeof(T)));
+    hipErrchk(hipMalloc(&ptr, len * sizeof(T)));
 
     return ptr;
   }
-  else{
+  else
+  {
     return new T[len];
   }
 }
 
-template<typename POL, typename T>
-void tensor_free(T *ptr){
-  if(TensorTestHelper<POL>::is_device){
+template <typename POL, typename T>
+void tensor_free(T* ptr)
+{
+  if (TensorTestHelper<POL>::is_device)
+  {
     hipErrchk(hipFree(ptr));
   }
-  else{
+  else
+  {
     delete[] ptr;
   }
 }
 
-template<typename POL, typename T>
-void tensor_copy_to_device(T *d_ptr, std::vector<T> const &h_vec){
-  if(TensorTestHelper<POL>::is_device){
-    hipErrchk(hipMemcpy(d_ptr, h_vec.data(), h_vec.size()*sizeof(T), hipMemcpyHostToDevice));
+template <typename POL, typename T>
+void tensor_copy_to_device(T* d_ptr, std::vector<T> const& h_vec)
+{
+  if (TensorTestHelper<POL>::is_device)
+  {
+    hipErrchk(hipMemcpy(d_ptr, h_vec.data(), h_vec.size() * sizeof(T),
+                        hipMemcpyHostToDevice));
   }
-  else{
-    memcpy(d_ptr, h_vec.data(), h_vec.size()*sizeof(T));
+  else
+  {
+    memcpy(d_ptr, h_vec.data(), h_vec.size() * sizeof(T));
   }
 }
 
-template<typename POL, typename T>
-void tensor_copy_to_host(std::vector<T> &h_vec, T const *d_ptr){
-  if(TensorTestHelper<POL>::is_device){
-    hipErrchk(hipMemcpy(h_vec.data(), d_ptr, h_vec.size()*sizeof(T), hipMemcpyDeviceToHost));
+template <typename POL, typename T>
+void tensor_copy_to_host(std::vector<T>& h_vec, T const* d_ptr)
+{
+  if (TensorTestHelper<POL>::is_device)
+  {
+    hipErrchk(hipMemcpy(h_vec.data(), d_ptr, h_vec.size() * sizeof(T),
+                        hipMemcpyDeviceToHost));
   }
-  else{
-    memcpy(h_vec.data(), d_ptr, h_vec.size()*sizeof(T));
+  else
+  {
+    memcpy(h_vec.data(), d_ptr, h_vec.size() * sizeof(T));
   }
 }
 
 
 #else
 
-template<typename POL, typename T>
-T* tensor_malloc(size_t len){
+template <typename POL, typename T>
+T* tensor_malloc(size_t len)
+{
   return new T[len];
 }
 
-template<typename POL, typename T>
-void tensor_free(T *ptr){
+template <typename POL, typename T>
+void tensor_free(T* ptr)
+{
   delete[] ptr;
 }
 
-template<typename POL, typename T>
-void tensor_copy_to_device(T *d_ptr, std::vector<T> const &h_vec){
-  memcpy(d_ptr, h_vec.data(), h_vec.size()*sizeof(T));
+template <typename POL, typename T>
+void tensor_copy_to_device(T* d_ptr, std::vector<T> const& h_vec)
+{
+  memcpy(d_ptr, h_vec.data(), h_vec.size() * sizeof(T));
 }
 
-template<typename POL, typename T>
-void tensor_copy_to_host(std::vector<T> &h_vec, T const *d_ptr){
-  memcpy(h_vec.data(), d_ptr, h_vec.size()*sizeof(T));
+template <typename POL, typename T>
+void tensor_copy_to_host(std::vector<T>& h_vec, T const* d_ptr)
+{
+  memcpy(h_vec.data(), d_ptr, h_vec.size() * sizeof(T));
 }
 
 #endif
 
 
-
 // Sugar to make things cleaner
-template<typename POL, typename T>
-T* tensor_malloc(std::vector<T> const &vec){
-  return tensor_malloc<POL,T>(vec.size());
+template <typename POL, typename T>
+T* tensor_malloc(std::vector<T> const& vec)
+{
+  return tensor_malloc<POL, T>(vec.size());
 }
 
 
-
-
 #endif
diff --git a/test/include/RAJA_test-workgroup.hpp b/test/include/RAJA_test-workgroup.hpp
index 77042a43e1..520337103a 100644
--- a/test/include/RAJA_test-workgroup.hpp
+++ b/test/include/RAJA_test-workgroup.hpp
@@ -18,28 +18,32 @@
 #include <new>
 #include <unordered_map>
 
-namespace detail {
+namespace detail
+{
 
-struct indirect_function_call_dispatch_typer {
-  template < typename ... >
+struct indirect_function_call_dispatch_typer
+{
+  template <typename...>
   using type = ::RAJA::indirect_function_call_dispatch;
 };
 
-struct indirect_virtual_function_dispatch_typer {
-  template < typename ... >
+struct indirect_virtual_function_dispatch_typer
+{
+  template <typename...>
   using type = ::RAJA::indirect_virtual_function_dispatch;
 };
 
-struct direct_dispatch_typer {
-  template < typename ... Ts >
+struct direct_dispatch_typer
+{
+  template <typename... Ts>
   using type = ::RAJA::direct_dispatch<Ts...>;
 };
 
 
-template < typename Resource >
+template <typename Resource>
 struct ResourceAllocator
 {
-  template < typename T >
+  template <typename T>
   struct std_allocator
   {
     using value_type = T;
@@ -47,26 +51,29 @@ struct ResourceAllocator
     std_allocator() = default;
 
     std_allocator(std_allocator const&) = default;
-    std_allocator(std_allocator &&) = default;
+    std_allocator(std_allocator&&)      = default;
 
     std_allocator& operator=(std_allocator const&) = default;
-    std_allocator& operator=(std_allocator &&) = default;
+    std_allocator& operator=(std_allocator&&)      = default;
 
-    template < typename U >
+    template <typename U>
     std_allocator(std_allocator<U> const& other) noexcept
-      : m_res(other.get_resource())
-    { }
+        : m_res(other.get_resource())
+    {}
 
     /*[[nodiscard]]*/
     value_type* allocate(size_t num)
     {
-      if (num > std::numeric_limits<size_t>::max() / sizeof(value_type)) {
+      if (num > std::numeric_limits<size_t>::max() / sizeof(value_type))
+      {
         throw std::bad_alloc();
       }
 
-      value_type* ptr = m_res.template allocate<value_type>(num, camp::resources::MemoryAccess::Pinned);
+      value_type* ptr = m_res.template allocate<value_type>(
+          num, camp::resources::MemoryAccess::Pinned);
 
-      if (!ptr) {
+      if (!ptr)
+      {
         throw std::bad_alloc();
       }
 
@@ -78,19 +85,19 @@ struct ResourceAllocator
       m_res.deallocate(ptr, camp::resources::MemoryAccess::Pinned);
     }
 
-    Resource const& get_resource() const
-    {
-      return m_res;
-    }
+    Resource const& get_resource() const { return m_res; }
 
     template <typename U>
-    friend inline bool operator==(std_allocator const& /*lhs*/, std_allocator<U> const& /*rhs*/)
+    friend inline bool operator==(std_allocator const& /*lhs*/,
+                                  std_allocator<U> const& /*rhs*/)
     {
-      return true; // lhs.get_resource() == rhs.get_resource(); // TODO not equality comparable yet
+      return true;  // lhs.get_resource() == rhs.get_resource(); // TODO not
+                    // equality comparable yet
     }
 
     template <typename U>
-    friend inline bool operator!=(std_allocator const& lhs, std_allocator<U> const& rhs)
+    friend inline bool operator!=(std_allocator const& lhs,
+                                  std_allocator<U> const& rhs)
     {
       return !(lhs == rhs);
     }
@@ -104,24 +111,25 @@ struct NeverEqualAllocator
 {
   using propagate_on_container_copy_assignment = std::false_type;
   using propagate_on_container_move_assignment = std::false_type;
-  using propagate_on_container_swap = std::true_type;
+  using propagate_on_container_swap            = std::true_type;
 
   NeverEqualAllocator() = default;
 
   NeverEqualAllocator(NeverEqualAllocator const&) = default;
-  NeverEqualAllocator(NeverEqualAllocator &&) = default;
+  NeverEqualAllocator(NeverEqualAllocator&&)      = default;
 
   NeverEqualAllocator& operator=(NeverEqualAllocator const&) = default;
-  NeverEqualAllocator& operator=(NeverEqualAllocator &&) = default;
+  NeverEqualAllocator& operator=(NeverEqualAllocator&&)      = default;
 
   NeverEqualAllocator select_on_container_copy_construction()
   {
-    return NeverEqualAllocator{};
+    return NeverEqualAllocator {};
   }
 
   ~NeverEqualAllocator()
   {
-    if (!m_allocations.empty()) {
+    if (!m_allocations.empty())
+    {
       RAJA_ABORT_OR_THROW("allocation map not empty at destruction");
     }
   }
@@ -129,9 +137,10 @@ struct NeverEqualAllocator
   /*[[nodiscard]]*/
   void* allocate(size_t size)
   {
-    void* ptr = malloc(size);
+    void* ptr   = malloc(size);
     auto iter_b = m_allocations.emplace(ptr, size);
-    if (!iter_b.second) {
+    if (!iter_b.second)
+    {
       RAJA_ABORT_OR_THROW("failed to add allocation to map");
     }
     return ptr;
@@ -140,20 +149,19 @@ struct NeverEqualAllocator
   void deallocate(void* ptr, size_t size) noexcept
   {
     auto iter = m_allocations.find(ptr);
-    if (iter == m_allocations.end()) {
+    if (iter == m_allocations.end())
+    {
       RAJA_ABORT_OR_THROW("failed to find allocation in map");
     }
-    if (iter->second != size) {
+    if (iter->second != size)
+    {
       RAJA_ABORT_OR_THROW("allocation size does not match known in map");
     }
     m_allocations.erase(iter);
     free(ptr);
   }
 
-  bool operator==(NeverEqualAllocator const&) const
-  {
-    return false;
-  }
+  bool operator==(NeverEqualAllocator const&) const { return false; }
 
 private:
   std::unordered_map<void*, size_t> m_allocations;
@@ -163,36 +171,27 @@ struct AlwaysEqualAllocator
 {
   using propagate_on_container_copy_assignment = std::false_type;
   using propagate_on_container_move_assignment = std::false_type;
-  using propagate_on_container_swap = std::false_type;
+  using propagate_on_container_swap            = std::false_type;
 
   AlwaysEqualAllocator() = default;
 
   AlwaysEqualAllocator(AlwaysEqualAllocator const&) = default;
-  AlwaysEqualAllocator(AlwaysEqualAllocator &&) = default;
+  AlwaysEqualAllocator(AlwaysEqualAllocator&&)      = default;
 
   AlwaysEqualAllocator& operator=(AlwaysEqualAllocator const&) = default;
-  AlwaysEqualAllocator& operator=(AlwaysEqualAllocator &&) = default;
+  AlwaysEqualAllocator& operator=(AlwaysEqualAllocator&&)      = default;
 
-  AlwaysEqualAllocator select_on_container_copy_construction()
-  {
-    return *this;
-  }
+  AlwaysEqualAllocator select_on_container_copy_construction() { return *this; }
 
   /*[[nodiscard]]*/
-  void* allocate(size_t size)
-  {
-    return get_allocator().allocate(size);
-  }
+  void* allocate(size_t size) { return get_allocator().allocate(size); }
 
   void deallocate(void* ptr, size_t size) noexcept
   {
     get_allocator().deallocate(ptr, size);
   }
 
-  bool operator==(AlwaysEqualAllocator const&) const
-  {
-    return true;
-  }
+  bool operator==(AlwaysEqualAllocator const&) const { return true; }
 
 private:
   static inline NeverEqualAllocator& get_allocator()
@@ -206,50 +205,54 @@ struct PropogatingAllocator : NeverEqualAllocator
 {
   using propagate_on_container_copy_assignment = std::true_type;
   using propagate_on_container_move_assignment = std::true_type;
-  using propagate_on_container_swap = std::true_type;
+  using propagate_on_container_swap            = std::true_type;
 
   PropogatingAllocator() = default;
 
   PropogatingAllocator(PropogatingAllocator const&) = default;
-  PropogatingAllocator(PropogatingAllocator &&) = default;
+  PropogatingAllocator(PropogatingAllocator&&)      = default;
 
   PropogatingAllocator& operator=(PropogatingAllocator const&) = default;
-  PropogatingAllocator& operator=(PropogatingAllocator &&) = default;
+  PropogatingAllocator& operator=(PropogatingAllocator&&)      = default;
 
   PropogatingAllocator select_on_container_copy_construction()
   {
-    return PropogatingAllocator(NeverEqualAllocator::select_on_container_copy_construction());
+    return PropogatingAllocator(
+        NeverEqualAllocator::select_on_container_copy_construction());
   }
 
 private:
   PropogatingAllocator(NeverEqualAllocator&& nea)
-    : NeverEqualAllocator(std::move(nea))
-  { }
+      : NeverEqualAllocator(std::move(nea))
+  {}
 };
 
-template < typename AllocatorImpl >
+template <typename AllocatorImpl>
 struct WorkStorageTestAllocator
 {
-  template < typename T >
+  template <typename T>
   struct std_allocator
   {
     using value_type = T;
-    using propagate_on_container_copy_assignment = typename AllocatorImpl::propagate_on_container_copy_assignment;
-    using propagate_on_container_move_assignment = typename AllocatorImpl::propagate_on_container_move_assignment;
-    using propagate_on_container_swap = typename AllocatorImpl::propagate_on_container_swap;
+    using propagate_on_container_copy_assignment =
+        typename AllocatorImpl::propagate_on_container_copy_assignment;
+    using propagate_on_container_move_assignment =
+        typename AllocatorImpl::propagate_on_container_move_assignment;
+    using propagate_on_container_swap =
+        typename AllocatorImpl::propagate_on_container_swap;
 
     std_allocator() = default;
 
     std_allocator(std_allocator const&) = default;
-    std_allocator(std_allocator &&) = default;
+    std_allocator(std_allocator&&)      = default;
 
     std_allocator& operator=(std_allocator const&) = default;
-    std_allocator& operator=(std_allocator &&) = default;
+    std_allocator& operator=(std_allocator&&)      = default;
 
-    template < typename U >
+    template <typename U>
     std_allocator(std_allocator<U> const& other) noexcept
-      : m_impl(other.get_impl())
-    { }
+        : m_impl(other.get_impl())
+    {}
 
     std_allocator select_on_container_copy_construction()
     {
@@ -259,13 +262,16 @@ struct WorkStorageTestAllocator
     /*[[nodiscard]]*/
     value_type* allocate(size_t num)
     {
-      if (num > std::numeric_limits<size_t>::max() / sizeof(value_type)) {
+      if (num > std::numeric_limits<size_t>::max() / sizeof(value_type))
+      {
         throw std::bad_alloc();
       }
 
-      value_type* ptr = static_cast<value_type*>(m_impl.allocate(num*sizeof(value_type)));
+      value_type* ptr =
+          static_cast<value_type*>(m_impl.allocate(num * sizeof(value_type)));
 
-      if (!ptr) {
+      if (!ptr)
+      {
         throw std::bad_alloc();
       }
 
@@ -274,130 +280,96 @@ struct WorkStorageTestAllocator
 
     void deallocate(value_type* ptr, size_t num) noexcept
     {
-      m_impl.deallocate(static_cast<void*>(ptr), num*sizeof(value_type));
+      m_impl.deallocate(static_cast<void*>(ptr), num * sizeof(value_type));
     }
 
-    AllocatorImpl const& get_impl() const
-    {
-      return m_impl;
-    }
+    AllocatorImpl const& get_impl() const { return m_impl; }
 
     template <typename U>
-    friend inline bool operator==(std_allocator const& lhs, std_allocator<U> const& rhs)
+    friend inline bool operator==(std_allocator const& lhs,
+                                  std_allocator<U> const& rhs)
     {
       return lhs.get_impl() == rhs.get_impl();
     }
 
     template <typename U>
-    friend inline bool operator!=(std_allocator const& lhs, std_allocator<U> const& rhs)
+    friend inline bool operator!=(std_allocator const& lhs,
+                                  std_allocator<U> const& rhs)
     {
       return !(lhs == rhs);
     }
 
   private:
-    std_allocator(AllocatorImpl&& impl)
-      : m_impl(std::move(impl))
-    { }
+    std_allocator(AllocatorImpl&& impl) : m_impl(std::move(impl)) {}
 
     AllocatorImpl m_impl;
   };
 };
 
-} // namespace detail
+}  // namespace detail
 
 
 //
 // Data types
 //
-using IndexTypeTypeList = camp::list<
-                                 int,
-                                 long,
-                                 RAJA::Index_type
-                               >;
-
-using XargsTypeList = camp::list<
-                                 RAJA::xargs<>,
-                                 RAJA::xargs<int*>,
-                                 RAJA::xargs<int, int*>
-                               >;
-
-using SequentialExecPolicyList =
-    camp::list<
-                RAJA::seq_work
-              >;
+using IndexTypeTypeList = camp::list<int, long, RAJA::Index_type>;
+
+using XargsTypeList =
+    camp::list<RAJA::xargs<>, RAJA::xargs<int*>, RAJA::xargs<int, int*>>;
+
+using SequentialExecPolicyList = camp::list<RAJA::seq_work>;
 using SequentialOrderedPolicyList =
-    camp::list<
-                RAJA::ordered,
-                RAJA::reverse_ordered
-              >;
+    camp::list<RAJA::ordered, RAJA::reverse_ordered>;
 using SequentialOrderPolicyList =
-    camp::list<
-                RAJA::ordered,
-                RAJA::reverse_ordered
-              >;
+    camp::list<RAJA::ordered, RAJA::reverse_ordered>;
 using SequentialStoragePolicyList =
-    camp::list<
-                RAJA::array_of_pointers,
-                RAJA::ragged_array_of_objects,
-                RAJA::constant_stride_array_of_objects
-              >;
+    camp::list<RAJA::array_of_pointers,
+               RAJA::ragged_array_of_objects,
+               RAJA::constant_stride_array_of_objects>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPExecPolicyList =
-    camp::list<
-                RAJA::omp_work
-              >;
+using OpenMPExecPolicyList    = camp::list<RAJA::omp_work>;
 using OpenMPOrderedPolicyList = SequentialOrderedPolicyList;
 using OpenMPOrderPolicyList   = SequentialOrderPolicyList;
 using OpenMPStoragePolicyList = SequentialStoragePolicyList;
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetExecPolicyList =
-    camp::list<
-                RAJA::omp_target_work
-              >;
+using OpenMPTargetExecPolicyList    = camp::list<RAJA::omp_target_work>;
 using OpenMPTargetOrderedPolicyList = SequentialOrderedPolicyList;
 using OpenMPTargetOrderPolicyList   = SequentialOrderPolicyList;
 using OpenMPTargetStoragePolicyList = SequentialStoragePolicyList;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaExecPolicyList =
-    camp::list<
-                #if defined(RAJA_TEST_EXHAUSTIVE)
-                // avoid compilation error:
-                // tpl/camp/include/camp/camp.hpp(104): error #456: excessive recursion at instantiation of class
-                RAJA::cuda_work<256>,
-                #endif
-                RAJA::cuda_work<1024>,
-                RAJA::cuda_work_explicit<256, 2>
-              >;
+using CudaExecPolicyList = camp::list<
+#if defined(RAJA_TEST_EXHAUSTIVE)
+    // avoid compilation error:
+    // tpl/camp/include/camp/camp.hpp(104): error #456: excessive recursion at
+    // instantiation of class
+    RAJA::cuda_work<256>,
+#endif
+    RAJA::cuda_work<1024>,
+    RAJA::cuda_work_explicit<256, 2>>;
 using CudaOrderedPolicyList = SequentialOrderedPolicyList;
-using CudaOrderPolicyList   =
-    camp::list<
-                RAJA::ordered,
-                RAJA::reverse_ordered,
-                RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average
-              >;
+using CudaOrderPolicyList =
+    camp::list<RAJA::ordered,
+               RAJA::reverse_ordered,
+               RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average>;
 using CudaStoragePolicyList = SequentialStoragePolicyList;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipExecPolicyList =
-    camp::list<
-                #if defined(RAJA_TEST_EXHAUSTIVE)
-                RAJA::hip_work<256>,
-                #endif
-                RAJA::hip_work<1024>
-              >;
+using HipExecPolicyList = camp::list<
+#if defined(RAJA_TEST_EXHAUSTIVE)
+    RAJA::hip_work<256>,
+#endif
+    RAJA::hip_work<1024>>;
 using HipOrderedPolicyList = SequentialOrderedPolicyList;
-using HipOrderPolicyList   =
-    camp::list<
-                RAJA::ordered,
-                RAJA::reverse_ordered
-              , RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average
-              >;
+using HipOrderPolicyList =
+    camp::list<RAJA::ordered,
+               RAJA::reverse_ordered,
+               RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average>;
 using HipStoragePolicyList = SequentialStoragePolicyList;
 #endif
 
@@ -405,15 +377,18 @@ using HipStoragePolicyList = SequentialStoragePolicyList;
 //
 // Dispatch policy type lists, broken up for compile time reasons
 //
-using IndirectFunctionDispatchTyperList = camp::list<detail::indirect_function_call_dispatch_typer>;
-using IndirectVirtualDispatchTyperList = camp::list<detail::indirect_virtual_function_dispatch_typer>;
+using IndirectFunctionDispatchTyperList =
+    camp::list<detail::indirect_function_call_dispatch_typer>;
+using IndirectVirtualDispatchTyperList =
+    camp::list<detail::indirect_virtual_function_dispatch_typer>;
 using DirectDispatchTyperList = camp::list<detail::direct_dispatch_typer>;
 
 
 //
 // Memory resource Allocator types
 //
-using HostAllocatorList = camp::list<typename detail::ResourceAllocator<camp::resources::Host>::template std_allocator<char>>;
+using HostAllocatorList = camp::list<typename detail::ResourceAllocator<
+    camp::resources::Host>::template std_allocator<char>>;
 
 using SequentialAllocatorList = HostAllocatorList;
 
@@ -422,23 +397,30 @@ using OpenMPAllocatorList = HostAllocatorList;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaAllocatorList = camp::list<typename detail::ResourceAllocator<camp::resources::Cuda>::template std_allocator<char>>;
+using CudaAllocatorList = camp::list<typename detail::ResourceAllocator<
+    camp::resources::Cuda>::template std_allocator<char>>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipAllocatorList = camp::list<typename detail::ResourceAllocator<camp::resources::Hip>::template std_allocator<char>>;
+using HipAllocatorList = camp::list<typename detail::ResourceAllocator<
+    camp::resources::Hip>::template std_allocator<char>>;
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetAllocatorList = camp::list<typename detail::ResourceAllocator<camp::resources::Omp>::template std_allocator<char>>;
+using OpenMPTargetAllocatorList = camp::list<typename detail::ResourceAllocator<
+    camp::resources::Omp>::template std_allocator<char>>;
 #endif
 
 
 //
 // Memory resource types for testing different std allocator requirements
 //
-using WorkStorageAllocatorList = camp::list<typename detail::WorkStorageTestAllocator<detail::AlwaysEqualAllocator>::template std_allocator<char>,
-                                            typename detail::WorkStorageTestAllocator<detail::NeverEqualAllocator>::template std_allocator<char>,
-                                            typename detail::WorkStorageTestAllocator<detail::PropogatingAllocator>::template std_allocator<char>>;
+using WorkStorageAllocatorList =
+    camp::list<typename detail::WorkStorageTestAllocator<
+                   detail::AlwaysEqualAllocator>::template std_allocator<char>,
+               typename detail::WorkStorageTestAllocator<
+                   detail::NeverEqualAllocator>::template std_allocator<char>,
+               typename detail::WorkStorageTestAllocator<
+                   detail::PropogatingAllocator>::template std_allocator<char>>;
 
 #endif  // __TEST_WORKGROUP_UTILS_HPP__
diff --git a/test/include/RAJA_unit-test-for3d3d.hpp b/test/include/RAJA_unit-test-for3d3d.hpp
index a2c43ec55e..2297745b8c 100644
--- a/test/include/RAJA_unit-test-for3d3d.hpp
+++ b/test/include/RAJA_unit-test-for3d3d.hpp
@@ -40,12 +40,15 @@ struct dim3d3d
 RAJA_HOST_DEVICE
 int index(dim3d3d idx, dim3d3d dim)
 {
-  return               idx.thread[0] +
-      dim.thread[0] * (idx.thread[1] +
-      dim.thread[1] * (idx.thread[2] +
-      dim.thread[2] * (idx.block[0] +
-      dim.block[0]  * (idx.block[1] +
-      dim.block[1]  * (idx.block[2])))));
+  return idx.thread[0] +
+         dim.thread[0] *
+             (idx.thread[1] +
+              dim.thread[1] *
+                  (idx.thread[2] +
+                   dim.thread[2] *
+                       (idx.block[0] +
+                        dim.block[0] *
+                            (idx.block[1] + dim.block[1] * (idx.block[2])))));
 }
 
 ///
@@ -56,41 +59,61 @@ int index(dim3d3d idx, dim3d3d dim)
 ///   /* code to test */
 /// } );
 ///
-template < typename test_policy, typename L >
+template <typename test_policy, typename L>
 inline void for3d3d(dim3d3d dim, L&& run);
 
 // test_seq implementation
-template < typename L >
+template <typename L>
 inline void for3d3d(test_seq, dim3d3d dim, L&& run)
 {
-  for (int bz = 0; bz < dim.block[2]; ++bz) {
-  for (int by = 0; by < dim.block[1]; ++by) {
-  for (int bx = 0; bx < dim.block[0]; ++bx) {
-    for (int tz = 0; tz < dim.thread[2]; ++tz) {
-    for (int ty = 0; ty < dim.thread[1]; ++ty) {
-    for (int tx = 0; tx < dim.thread[0]; ++tx) {
-      run(dim3d3d{{tx,ty,tz}, {bx,by,bz}}, dim);
-    }}}
-  }}}
+  for (int bz = 0; bz < dim.block[2]; ++bz)
+  {
+    for (int by = 0; by < dim.block[1]; ++by)
+    {
+      for (int bx = 0; bx < dim.block[0]; ++bx)
+      {
+        for (int tz = 0; tz < dim.thread[2]; ++tz)
+        {
+          for (int ty = 0; ty < dim.thread[1]; ++ty)
+          {
+            for (int tx = 0; tx < dim.thread[0]; ++tx)
+            {
+              run(dim3d3d {{tx, ty, tz}, {bx, by, bz}}, dim);
+            }
+          }
+        }
+      }
+    }
+  }
 }
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
 // test_openmp_target implementation
-template < typename L >
+template <typename L>
 inline void for3d3d(test_openmp_target, dim3d3d dim, L&& run)
 {
 #pragma omp target teams distribute collapse(3)
-  for (int bz = 0; bz < dim.block[2]; ++bz) {
-  for (int by = 0; by < dim.block[1]; ++by) {
-  for (int bx = 0; bx < dim.block[0]; ++bx) {
+  for (int bz = 0; bz < dim.block[2]; ++bz)
+  {
+    for (int by = 0; by < dim.block[1]; ++by)
+    {
+      for (int bx = 0; bx < dim.block[0]; ++bx)
+      {
 #pragma omp parallel for collapse(3)
-    for (int tz = 0; tz < dim.thread[2]; ++tz) {
-    for (int ty = 0; ty < dim.thread[1]; ++ty) {
-    for (int tx = 0; tx < dim.thread[0]; ++tx) {
-      run(dim3d3d{{tx,ty,tz}, {bx,by,bz}}, dim);
-    }}}
-  }}}
+        for (int tz = 0; tz < dim.thread[2]; ++tz)
+        {
+          for (int ty = 0; ty < dim.thread[1]; ++ty)
+          {
+            for (int tx = 0; tx < dim.thread[0]; ++tx)
+            {
+              run(dim3d3d {{tx, ty, tz}, {bx, by, bz}}, dim);
+            }
+          }
+        }
+      }
+    }
+  }
 }
 
 #endif
@@ -100,20 +123,25 @@ inline void for3d3d(test_openmp_target, dim3d3d dim, L&& run)
 template <typename L>
 __global__ void for3d3d_cuda_global(L run)
 {
-  run(dim3d3d{{static_cast<int>(threadIdx.x), static_cast<int>(threadIdx.y), static_cast<int>(threadIdx.z)},
-              {static_cast<int>(blockIdx.x), static_cast<int>(blockIdx.y), static_cast<int>(blockIdx.z)}},
-      dim3d3d{{static_cast<int>(blockDim.x), static_cast<int>(blockDim.y), static_cast<int>(blockDim.z)},
-              {static_cast<int>(gridDim.x), static_cast<int>(gridDim.y), static_cast<int>(gridDim.z)}});
+  run(dim3d3d {{static_cast<int>(threadIdx.x), static_cast<int>(threadIdx.y),
+                static_cast<int>(threadIdx.z)},
+               {static_cast<int>(blockIdx.x), static_cast<int>(blockIdx.y),
+                static_cast<int>(blockIdx.z)}},
+      dim3d3d {{static_cast<int>(blockDim.x), static_cast<int>(blockDim.y),
+                static_cast<int>(blockDim.z)},
+               {static_cast<int>(gridDim.x), static_cast<int>(gridDim.y),
+                static_cast<int>(gridDim.z)}});
 }
 
 // test_cuda implementation
-template < typename L >
+template <typename L>
 inline void for3d3d(test_cuda, dim3d3d dim, L&& run)
 {
-   for3d3d_cuda_global<<<dim3(dim.block[0], dim.block[1], dim.block[2]),
-                         dim3(dim.thread[0], dim.thread[1], dim.thread[2])>>>(std::forward<L>(run));
-   cudaErrchk(cudaGetLastError());
-   cudaErrchk(cudaDeviceSynchronize());
+  for3d3d_cuda_global<<<dim3(dim.block[0], dim.block[1], dim.block[2]),
+                        dim3(dim.thread[0], dim.thread[1], dim.thread[2])>>>(
+      std::forward<L>(run));
+  cudaErrchk(cudaGetLastError());
+  cudaErrchk(cudaDeviceSynchronize());
 }
 
 #endif
@@ -123,31 +151,34 @@ inline void for3d3d(test_cuda, dim3d3d dim, L&& run)
 template <typename L>
 __global__ void for3d3d_hip_global(L run)
 {
-  run(dim3d3d{{static_cast<int>(threadIdx.x), static_cast<int>(threadIdx.y), static_cast<int>(threadIdx.z)},
-              {static_cast<int>(blockIdx.x), static_cast<int>(blockIdx.y), static_cast<int>(blockIdx.z)}},
-      dim3d3d{{static_cast<int>(blockDim.x), static_cast<int>(blockDim.y), static_cast<int>(blockDim.z)},
-              {static_cast<int>(gridDim.x), static_cast<int>(gridDim.y), static_cast<int>(gridDim.z)}});
+  run(dim3d3d {{static_cast<int>(threadIdx.x), static_cast<int>(threadIdx.y),
+                static_cast<int>(threadIdx.z)},
+               {static_cast<int>(blockIdx.x), static_cast<int>(blockIdx.y),
+                static_cast<int>(blockIdx.z)}},
+      dim3d3d {{static_cast<int>(blockDim.x), static_cast<int>(blockDim.y),
+                static_cast<int>(blockDim.z)},
+               {static_cast<int>(gridDim.x), static_cast<int>(gridDim.y),
+                static_cast<int>(gridDim.z)}});
 }
 
 // test_hip implementation
-template < typename L >
+template <typename L>
 inline void for3d3d(test_hip, dim3d3d dim, L&& run)
 {
-   hipLaunchKernelGGL(for3d3d_hip_global<camp::decay<L>>,
-                      dim3(dim.block[0], dim.block[1], dim.block[2]),
-                      dim3(dim.thread[0], dim.thread[1], dim.thread[2]),
-                      0, 0,
-                      std::forward<L>(run));
-   hipErrchk(hipGetLastError());
-   hipErrchk(hipDeviceSynchronize());
+  hipLaunchKernelGGL(for3d3d_hip_global<camp::decay<L>>,
+                     dim3(dim.block[0], dim.block[1], dim.block[2]),
+                     dim3(dim.thread[0], dim.thread[1], dim.thread[2]), 0, 0,
+                     std::forward<L>(run));
+  hipErrchk(hipGetLastError());
+  hipErrchk(hipDeviceSynchronize());
 }
 
 #endif
 
-template < typename test_policy, typename L >
+template <typename test_policy, typename L>
 void for3d3d(dim3d3d dim, L&& run)
 {
-  for3d3d(test_policy{}, dim, std::forward<L>(run));
+  for3d3d(test_policy {}, dim, std::forward<L>(run));
 }
 
-#endif // RAJA_test_for3d3d_HPP__
+#endif  // RAJA_test_for3d3d_HPP__
diff --git a/test/include/RAJA_unit-test-forone.hpp b/test/include/RAJA_unit-test-forone.hpp
index 4e9fc521e4..d5315c6e1c 100644
--- a/test/include/RAJA_unit-test-forone.hpp
+++ b/test/include/RAJA_unit-test-forone.hpp
@@ -18,11 +18,11 @@
 ///
 /// forone<test_policy>( [=] RAJA_HOST_DEVICE(){ /* code to test */ } );
 ///
-template < typename test_policy, typename L >
+template <typename test_policy, typename L>
 inline void forone(L&& run);
 
 // test_seq implementation
-template < typename L >
+template <typename L>
 inline void forone(test_seq, L&& run)
 {
   std::forward<L>(run)();
@@ -31,7 +31,7 @@ inline void forone(test_seq, L&& run)
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
 // test_openmp_target implementation
-template < typename L >
+template <typename L>
 inline void forone(test_openmp_target, L&& run)
 {
 #pragma omp target
@@ -49,12 +49,12 @@ __global__ void forone_cuda_global(L run)
 }
 
 // test_cuda implementation
-template < typename L >
+template <typename L>
 inline void forone(test_cuda, L&& run)
 {
-   forone_cuda_global<<<1,1>>>(std::forward<L>(run));
-   cudaErrchk(cudaGetLastError());
-   cudaErrchk(cudaDeviceSynchronize());
+  forone_cuda_global<<<1, 1>>>(std::forward<L>(run));
+  cudaErrchk(cudaGetLastError());
+  cudaErrchk(cudaDeviceSynchronize());
 }
 
 #endif
@@ -68,20 +68,21 @@ __global__ void forone_hip_global(L run)
 }
 
 // test_hip implementation
-template < typename L >
+template <typename L>
 inline void forone(test_hip, L&& run)
 {
-   hipLaunchKernelGGL(forone_hip_global<camp::decay<L>>, dim3(1), dim3(1), 0, 0, std::forward<L>(run));
-   hipErrchk(hipGetLastError());
-   hipErrchk(hipDeviceSynchronize());
+  hipLaunchKernelGGL(forone_hip_global<camp::decay<L>>, dim3(1), dim3(1), 0, 0,
+                     std::forward<L>(run));
+  hipErrchk(hipGetLastError());
+  hipErrchk(hipDeviceSynchronize());
 }
 
 #endif
 
-template < typename test_policy, typename L >
+template <typename test_policy, typename L>
 void forone(L&& run)
 {
-  forone(test_policy{}, std::forward<L>(run));
+  forone(test_policy {}, std::forward<L>(run));
 }
 
-#endif // RAJA_test_forone_HPP__
+#endif  // RAJA_test_forone_HPP__
diff --git a/test/include/RAJA_unit-test-policy.hpp b/test/include/RAJA_unit-test-policy.hpp
index e0aa1f8c65..2fa89acc58 100644
--- a/test/include/RAJA_unit-test-policy.hpp
+++ b/test/include/RAJA_unit-test-policy.hpp
@@ -20,48 +20,59 @@
 
 
 // base classes to represent host or device in exec_dispatcher
-struct RunOnHost {};
-struct RunOnDevice {};
+struct RunOnHost
+{};
+struct RunOnDevice
+{};
 
 // sequential test policy
-struct test_seq : public RunOnHost  { };
+struct test_seq : public RunOnHost
+{};
 
 // struct with specializations containing information about test policies
-template < typename test_policy >
+template <typename test_policy>
 struct test_policy_info;
 
 // alias for equivalent RAJA exec policy to given test policy
-template < typename test_policy >
-using test_equivalent_exec_policy = typename test_policy_info<test_policy>::type;
+template <typename test_policy>
+using test_equivalent_exec_policy =
+    typename test_policy_info<test_policy>::type;
 
 // alias for platform of given test policy
-template < typename test_policy >
+template <typename test_policy>
 using test_platform = typename test_policy_info<test_policy>::platform;
 
 // alias for platform of given test policy
-template < typename test_policy >
+template <typename test_policy>
 using test_resource = typename test_policy_info<test_policy>::resource;
 
-template < typename test_policy >
+template <typename test_policy>
 test_resource<test_policy> get_test_resource()
 {
   return test_resource<test_policy>::get_default();
 }
 
-template < typename dst_resource, typename src_resource, typename T >
-inline T* test_reallocate(dst_resource dst_res, src_resource src_res, T* src, size_t len)
+template <typename dst_resource, typename src_resource, typename T>
+inline T*
+test_reallocate(dst_resource dst_res, src_resource src_res, T* src, size_t len)
 {
   T* dst = nullptr;
-  if (dst_res.get_platform() == camp::resources::Platform::host) {
+  if (dst_res.get_platform() == camp::resources::Platform::host)
+  {
     dst = dst_res.template allocate<T>(len);
-    src_res.memcpy(dst, src, len*sizeof(T));
+    src_res.memcpy(dst, src, len * sizeof(T));
     src_res.wait();
-  } else if (src_res.get_platform() == camp::resources::Platform::host) {
+  }
+  else if (src_res.get_platform() == camp::resources::Platform::host)
+  {
     dst = dst_res.template allocate<T>(len);
-    dst_res.memcpy(dst, src, len*sizeof(T));
+    dst_res.memcpy(dst, src, len * sizeof(T));
     dst_res.wait();
-  } else {
-    throw std::runtime_error("Expected source or destination resource to be host");
+  }
+  else
+  {
+    throw std::runtime_error(
+        "Expected source or destination resource to be host");
   }
   src_res.deallocate(src);
   return dst;
@@ -69,11 +80,11 @@ inline T* test_reallocate(dst_resource dst_res, src_resource src_res, T* src, si
 
 
 // test_seq policy information
-template < >
+template <>
 struct test_policy_info<test_seq>
 {
   using resource = camp::resources::Host;
-  using type = RAJA::seq_exec;
+  using type     = RAJA::seq_exec;
   using platform = RunOnHost;
   static const char* name() { return "test_seq"; }
 };
@@ -81,14 +92,15 @@ struct test_policy_info<test_seq>
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
 // cuda test policy
-struct test_openmp_target : public RunOnHost { };
+struct test_openmp_target : public RunOnHost
+{};
 
 // test_openmp_target policy information
-template < >
+template <>
 struct test_policy_info<test_openmp_target>
 {
   using resource = camp::resources::Omp;
-  using type = RAJA::omp_target_parallel_for_exec<1>;
+  using type     = RAJA::omp_target_parallel_for_exec<1>;
   using platform = RunOnHost;
   static const char* name() { return "test_openmp_target"; }
 };
@@ -98,14 +110,15 @@ struct test_policy_info<test_openmp_target>
 #if defined(RAJA_ENABLE_CUDA)
 
 // cuda test policy
-struct test_cuda : public RunOnDevice { };
+struct test_cuda : public RunOnDevice
+{};
 
 // test_cuda policy information
-template < >
+template <>
 struct test_policy_info<test_cuda>
 {
   using resource = camp::resources::Cuda;
-  using type = RAJA::cuda_exec<1>;
+  using type     = RAJA::cuda_exec<1>;
   using platform = RunOnDevice;
   static const char* name() { return "test_cuda"; }
 };
@@ -115,14 +128,15 @@ struct test_policy_info<test_cuda>
 #if defined(RAJA_ENABLE_HIP)
 
 // hip test policy
-struct test_hip : public RunOnDevice { };
+struct test_hip : public RunOnDevice
+{};
 
 // test_hip policy information
-template < >
+template <>
 struct test_policy_info<test_hip>
 {
   using resource = camp::resources::Hip;
-  using type = RAJA::hip_exec<1>;
+  using type     = RAJA::hip_exec<1>;
   using platform = RunOnDevice;
   static const char* name() { return "test_hip"; }
 };
@@ -151,4 +165,4 @@ using OpenMPTargetUnitTestPolicyList = camp::list<test_openmp_target>;
 using HipUnitTestPolicyList = camp::list<test_hip>;
 #endif
 
-#endif // RAJA_test_policy_HPP__
+#endif  // RAJA_test_policy_HPP__
diff --git a/test/include/RAJA_unit-test-types.hpp b/test/include/RAJA_unit-test-types.hpp
index bb65134534..618a625ede 100644
--- a/test/include/RAJA_unit-test-types.hpp
+++ b/test/include/RAJA_unit-test-types.hpp
@@ -34,42 +34,27 @@ using UnitIntegralTypes = ::testing::Types<char,
 // Expanded integral types used in RAJA index unit tests
 //
 #ifndef RAJA_UNIT_EXPANDED_INTEGRAL_TYPES
-  #define RAJA_UNIT_EXPANDED_INTEGRAL_TYPES \
-    RAJA::Index_type,    \
-    char,                \
-    unsigned char,       \
-    short,               \
-    unsigned short,      \
-    int,                 \
-    unsigned int,        \
-    long,                \
-    unsigned long,       \
-    long int,            \
-    unsigned long int,   \
-    long long,           \
-    unsigned long long
-#endif // RAJA_UNIT_EXPANDED_INTEGRAL_TYPES
+#define RAJA_UNIT_EXPANDED_INTEGRAL_TYPES                                      \
+  RAJA::Index_type, char, unsigned char, short, unsigned short, int,           \
+      unsigned int, long, unsigned long, long int, unsigned long int,          \
+      long long, unsigned long long
+#endif  // RAJA_UNIT_EXPANDED_INTEGRAL_TYPES
 
 #ifndef RAJA_UNIT_FLOAT_TYPES
 #ifndef __clang__
-  #define RAJA_UNIT_FLOAT_TYPES \
-    float,               \
-    double,              \
-    long double
+#define RAJA_UNIT_FLOAT_TYPES float, double, long double
 #else
-  #define RAJA_UNIT_FLOAT_TYPES \
-    float,               \
-    double
-#endif // __clang__
-#endif // FLOATING_TYPES
+#define RAJA_UNIT_FLOAT_TYPES float, double
+#endif  // __clang__
+#endif  // FLOATING_TYPES
 
-using UnitExpandedIntegralTypes = 
-  ::testing::Types<RAJA_UNIT_EXPANDED_INTEGRAL_TYPES>;
+using UnitExpandedIntegralTypes =
+    ::testing::Types<RAJA_UNIT_EXPANDED_INTEGRAL_TYPES>;
 
 using UnitFloatTypes = ::testing::Types<RAJA_UNIT_FLOAT_TYPES>;
 
-using UnitIntFloatTypes = 
-  ::testing::Types<RAJA_UNIT_EXPANDED_INTEGRAL_TYPES,RAJA_UNIT_FLOAT_TYPES>;
+using UnitIntFloatTypes =
+    ::testing::Types<RAJA_UNIT_EXPANDED_INTEGRAL_TYPES, RAJA_UNIT_FLOAT_TYPES>;
 
 //
 // Standard list of index types used in RAJA index unit tests
diff --git a/test/include/type_helper.hpp b/test/include/type_helper.hpp
index 3a4581c8a0..4cb2fd0975 100644
--- a/test/include/type_helper.hpp
+++ b/test/include/type_helper.hpp
@@ -30,7 +30,8 @@ template <typename S, typename T>
 struct type_cat;
 
 template <typename... Ss, typename... Ts>
-struct type_cat<std::tuple<Ss...>, std::tuple<Ts...>> {
+struct type_cat<std::tuple<Ss...>, std::tuple<Ts...>>
+{
   using type = std::tuple<Ss..., Ts...>;
 };
 
@@ -39,26 +40,30 @@ template <typename S, typename T>
 struct product;
 
 template <typename S, typename... Ss, typename... Ts>
-struct product<std::tuple<S, Ss...>, std::tuple<Ts...>> {
+struct product<std::tuple<S, Ss...>, std::tuple<Ts...>>
+{
   // the cartesian product of {S} and {Ts...}
   // is a list of pairs -- here: a std::tuple of 2-element std::tuples
   using S_cross_Ts = std::tuple<std::tuple<S, Ts>...>;
 
   // the cartesian product of {Ss...} and {Ts...} (computed recursively)
-  using Ss_cross_Ts = typename product<std::tuple<Ss...>, std::tuple<Ts...>>::type;
+  using Ss_cross_Ts =
+      typename product<std::tuple<Ss...>, std::tuple<Ts...>>::type;
 
   // concatenate both products
   using type = typename type_cat<S_cross_Ts, Ss_cross_Ts>::type;
 };
 
 template <typename... Ss, typename... Ts, typename... Smembers>
-struct product<std::tuple<std::tuple<Smembers...>, Ss...>, std::tuple<Ts...>> {
+struct product<std::tuple<std::tuple<Smembers...>, Ss...>, std::tuple<Ts...>>
+{
   // the cartesian product of {S} and {Ts...}
   // is a list of pairs -- here: a std::tuple of 2-element std::tuples
   using S_cross_Ts = std::tuple<std::tuple<Smembers..., Ts>...>;
 
   // the cartesian product of {Ss...} and {Ts...} (computed recursively)
-  using Ss_cross_Ts = typename product<std::tuple<Ss...>, std::tuple<Ts...>>::type;
+  using Ss_cross_Ts =
+      typename product<std::tuple<Ss...>, std::tuple<Ts...>>::type;
 
   // concatenate both products
   using type = typename type_cat<S_cross_Ts, Ss_cross_Ts>::type;
@@ -66,7 +71,8 @@ struct product<std::tuple<std::tuple<Smembers...>, Ss...>, std::tuple<Ts...>> {
 
 // end the recursion
 template <typename... Ts>
-struct product<std::tuple<>, std::tuple<Ts...>> {
+struct product<std::tuple<>, std::tuple<Ts...>>
+{
   using type = std::tuple<>;
 };
 }  // namespace types
@@ -78,12 +84,14 @@ template <typename...>
 struct concat;
 
 template <template <class...> class T, typename U>
-struct concat<T<U>> {
+struct concat<T<U>>
+{
   using type = U;
 };
 
 template <typename T>
-struct concat<T> {
+struct concat<T>
+{
   using type = T;
 };
 
@@ -91,7 +99,8 @@ template <template <class...> class T,
           class... Front,
           class... Next,
           class... Rest>
-struct concat<T<Front...>, T<Next...>, Rest...> {
+struct concat<T<Front...>, T<Next...>, Rest...>
+{
   using type = typename concat<T<Front..., Next...>, Rest...>::type;
 };
 
@@ -99,12 +108,14 @@ template <typename... Ts>
 using concat_t = typename concat<Ts...>::type;
 
 template <class T>
-struct collapse {
+struct collapse
+{
   using type = T;
 };
 
 template <template <class...> class T, class... U>
-struct collapse<T<T<U...>>> {
+struct collapse<T<T<U...>>>
+{
   using type = typename collapse<T<U...>>::type;
 };
 
@@ -115,7 +126,8 @@ template <template <class> class, class>
 struct apply;
 
 template <template <class...> class L, template <class> class Fn, class... Ts>
-struct apply<Fn, L<Ts...>> {
+struct apply<Fn, L<Ts...>>
+{
   using type = collapse_t<L<concat_t<Fn<Ts>...>>>;
 };
 
@@ -131,7 +143,8 @@ template <typename T>
 struct ForTesting;
 
 template <template <class...> class T, typename... Ts>
-struct ForTesting<T<Ts...>> {
+struct ForTesting<T<Ts...>>
+{
   using type = ::testing::Types<Ts...>;
 };
 }  // namespace detail
diff --git a/test/install/using-with-cmake/using-with-cmake.cpp b/test/install/using-with-cmake/using-with-cmake.cpp
index b748f316df..82d77e9d98 100644
--- a/test/install/using-with-cmake/using-with-cmake.cpp
+++ b/test/install/using-with-cmake/using-with-cmake.cpp
@@ -7,25 +7,23 @@
 #include "RAJA/RAJA.hpp"
 
 
-int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) 
+int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv))
 {
-  constexpr std::size_t N{1024};
+  constexpr std::size_t N {1024};
 
   double* a = new double[N];
   double* b = new double[N];
-  double c = 3.14159;
-  
-  for (std::size_t i = 0; i < N; i++) {
+  double c  = 3.14159;
+
+  for (std::size_t i = 0; i < N; i++)
+  {
     a[i] = 1.0;
     b[i] = 2.0;
   }
 
-  RAJA::forall<RAJA::seq_exec>(
-    RAJA::RangeSegment(0, N),
-    [=] RAJA_HOST_DEVICE (std::size_t i) {
-      a[i] += b[i] * c;
-    }
-  );
+  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
+                               [=] RAJA_HOST_DEVICE(std::size_t i)
+                               { a[i] += b[i] * c; });
 
   delete[] a;
   delete[] b;
diff --git a/test/integration/plugin/plugin_to_test.cpp b/test/integration/plugin/plugin_to_test.cpp
index 8290804191..bda2f313a3 100644
--- a/test/integration/plugin/plugin_to_test.cpp
+++ b/test/integration/plugin/plugin_to_test.cpp
@@ -12,11 +12,11 @@
 
 #include "counter.hpp"
 
-class CounterPlugin :
-  public RAJA::util::PluginStrategy
+class CounterPlugin : public RAJA::util::PluginStrategy
 {
-  public:
-  void preCapture(const RAJA::util::PluginContext& p) override {
+public:
+  void preCapture(const RAJA::util::PluginContext& p) override
+  {
     ASSERT_NE(plugin_test_data, nullptr);
     ASSERT_NE(plugin_test_resource, nullptr);
 
@@ -30,7 +30,8 @@ class CounterPlugin :
     plugin_test_resource->memcpy(plugin_test_data, &data, sizeof(CounterData));
   }
 
-  void postCapture(const RAJA::util::PluginContext& p) override {
+  void postCapture(const RAJA::util::PluginContext& p) override
+  {
     ASSERT_NE(plugin_test_data, nullptr);
     ASSERT_NE(plugin_test_resource, nullptr);
 
@@ -44,7 +45,8 @@ class CounterPlugin :
     plugin_test_resource->memcpy(plugin_test_data, &data, sizeof(CounterData));
   }
 
-  void preLaunch(const RAJA::util::PluginContext& p) override {
+  void preLaunch(const RAJA::util::PluginContext& p) override
+  {
     ASSERT_NE(plugin_test_data, nullptr);
     ASSERT_NE(plugin_test_resource, nullptr);
 
@@ -58,7 +60,8 @@ class CounterPlugin :
     plugin_test_resource->memcpy(plugin_test_data, &data, sizeof(CounterData));
   }
 
-  void postLaunch(const RAJA::util::PluginContext& p) override {
+  void postLaunch(const RAJA::util::PluginContext& p) override
+  {
     ASSERT_NE(plugin_test_data, nullptr);
     ASSERT_NE(plugin_test_resource, nullptr);
 
@@ -74,4 +77,5 @@ class CounterPlugin :
 };
 
 // Statically loading plugin.
-static RAJA::util::PluginRegistry::add<CounterPlugin> P("counter-plugin", "Counter");
+static RAJA::util::PluginRegistry::add<CounterPlugin> P("counter-plugin",
+                                                        "Counter");
diff --git a/test/integration/plugin/tests/counter.hpp b/test/integration/plugin/tests/counter.hpp
index bb22f697dd..a652fe9db1 100644
--- a/test/integration/plugin/tests/counter.hpp
+++ b/test/integration/plugin/tests/counter.hpp
@@ -4,18 +4,18 @@
 //
 // SPDX-License-Identifier: (BSD-3-Clause)
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-#ifndef  RAJA_counter_HPP
-#define  RAJA_counter_HPP
+#ifndef RAJA_counter_HPP
+#define RAJA_counter_HPP
 
 
 struct CounterData
 {
   RAJA::Platform capture_platform_active = RAJA::Platform::undefined;
-  int            capture_counter_pre     = 0;
-  int            capture_counter_post    = 0;
-  RAJA::Platform launch_platform_active = RAJA::Platform::undefined;
-  int            launch_counter_pre     = 0;
-  int            launch_counter_post    = 0;
+  int capture_counter_pre                = 0;
+  int capture_counter_post               = 0;
+  RAJA::Platform launch_platform_active  = RAJA::Platform::undefined;
+  int launch_counter_pre                 = 0;
+  int launch_counter_post                = 0;
 };
 
 // note the use of a pointer here to allow different types of memory
diff --git a/test/integration/plugin/tests/test-plugin-forall.hpp b/test/integration/plugin/tests/test-plugin-forall.hpp
index 3b74d6249d..2db5d1c5e4 100644
--- a/test/integration/plugin/tests/test-plugin-forall.hpp
+++ b/test/integration/plugin/tests/test-plugin-forall.hpp
@@ -21,173 +21,169 @@
 // once before and after each kernel invocation for the launch counter.
 
 // test with basic forall
-template <typename ExecPolicy,
-          typename WORKING_RES,
-          RAJA::Platform PLATFORM>
+template <typename ExecPolicy, typename WORKING_RES, RAJA::Platform PLATFORM>
 void PluginForallTestImpl()
 {
   SetupPluginVars spv(WORKING_RES::get_default());
 
   CounterData* data = plugin_test_resource->allocate<CounterData>(10);
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++)
+  {
 
-    RAJA::forall<ExecPolicy>(
-      RAJA::RangeSegment(i,i+1),
-      PluginTestCallable{data}
-    );
+    RAJA::forall<ExecPolicy>(RAJA::RangeSegment(i, i + 1),
+                             PluginTestCallable {data});
 
     CounterData loop_data;
     plugin_test_resource->memcpy(&loop_data, &data[i], sizeof(CounterData));
     ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.capture_counter_post,    i);
+    ASSERT_EQ(loop_data.capture_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.capture_counter_post, i);
     ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.launch_counter_post,    i);
+    ASSERT_EQ(loop_data.launch_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.launch_counter_post, i);
   }
 
   CounterData plugin_data;
-  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                               sizeof(CounterData));
   ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+  ASSERT_EQ(plugin_data.capture_counter_post, 10);
   ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
-  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_counter_pre, 10);
+  ASSERT_EQ(plugin_data.launch_counter_post, 10);
 
   plugin_test_resource->deallocate(data);
 }
 
 // test with basic forall_Icount
-template <typename ExecPolicy,
-          typename WORKING_RES,
-          RAJA::Platform PLATFORM>
+template <typename ExecPolicy, typename WORKING_RES, RAJA::Platform PLATFORM>
 void PluginForAllICountTestImpl()
 {
   SetupPluginVars spv(WORKING_RES::get_default());
 
   CounterData* data = plugin_test_resource->allocate<CounterData>(10);
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++)
+  {
 
-    RAJA::forall_Icount<ExecPolicy>(
-      RAJA::RangeSegment(i,i+1), i,
-      PluginTestCallable{data}
-    );
+    RAJA::forall_Icount<ExecPolicy>(RAJA::RangeSegment(i, i + 1), i,
+                                    PluginTestCallable {data});
 
     CounterData loop_data;
     plugin_test_resource->memcpy(&loop_data, &data[i], sizeof(CounterData));
     ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.capture_counter_post,    i);
+    ASSERT_EQ(loop_data.capture_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.capture_counter_post, i);
     ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.launch_counter_post,    i);
+    ASSERT_EQ(loop_data.launch_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.launch_counter_post, i);
   }
 
   CounterData plugin_data;
-  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                               sizeof(CounterData));
   ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+  ASSERT_EQ(plugin_data.capture_counter_post, 10);
   ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
-  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_counter_pre, 10);
+  ASSERT_EQ(plugin_data.launch_counter_post, 10);
 
   plugin_test_resource->deallocate(data);
 }
 
 // test with IndexSet forall
-template <typename ExecPolicy,
-          typename WORKING_RES,
-          RAJA::Platform PLATFORM>
+template <typename ExecPolicy, typename WORKING_RES, RAJA::Platform PLATFORM>
 void PluginForAllIdxSetTestImpl()
 {
   SetupPluginVars spv(WORKING_RES::get_default());
 
   CounterData* data = plugin_test_resource->allocate<CounterData>(10);
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++)
+  {
 
-    RAJA::TypedIndexSet< RAJA::RangeSegment > iset;
+    RAJA::TypedIndexSet<RAJA::RangeSegment> iset;
 
-    for (int j = i; j < 10; j++) {
-      iset.push_back(RAJA::RangeSegment(j, j+1));
+    for (int j = i; j < 10; j++)
+    {
+      iset.push_back(RAJA::RangeSegment(j, j + 1));
     }
 
     RAJA::forall<RAJA::ExecPolicy<RAJA::seq_segit, ExecPolicy>>(
-      iset,
-      PluginTestCallable{data}
-    );
+        iset, PluginTestCallable {data});
 
-    for (int j = i; j < 10; j++) {
+    for (int j = i; j < 10; j++)
+    {
       CounterData loop_data;
       plugin_test_resource->memcpy(&loop_data, &data[j], sizeof(CounterData));
       ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
-      ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
-      ASSERT_EQ(loop_data.capture_counter_post,    i);
+      ASSERT_EQ(loop_data.capture_counter_pre, i + 1);
+      ASSERT_EQ(loop_data.capture_counter_post, i);
       ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
-      ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
-      ASSERT_EQ(loop_data.launch_counter_post,    i);
+      ASSERT_EQ(loop_data.launch_counter_pre, i + 1);
+      ASSERT_EQ(loop_data.launch_counter_post, i);
     }
   }
 
   CounterData plugin_data;
-  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                               sizeof(CounterData));
   ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+  ASSERT_EQ(plugin_data.capture_counter_post, 10);
   ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
-  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_counter_pre, 10);
+  ASSERT_EQ(plugin_data.launch_counter_post, 10);
 
   plugin_test_resource->deallocate(data);
 }
 
 // test with IndexSet forall_Icount
-template <typename ExecPolicy,
-          typename WORKING_RES,
-          RAJA::Platform PLATFORM>
+template <typename ExecPolicy, typename WORKING_RES, RAJA::Platform PLATFORM>
 void PluginForAllIcountIdxSetTestImpl()
 {
   SetupPluginVars spv(WORKING_RES::get_default());
 
   CounterData* data = plugin_test_resource->allocate<CounterData>(10);
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++)
+  {
 
-    RAJA::TypedIndexSet< RAJA::RangeSegment > iset;
+    RAJA::TypedIndexSet<RAJA::RangeSegment> iset;
 
-    for (int j = i; j < 10; j++) {
-      iset.push_back(RAJA::RangeSegment(j, j+1));
+    for (int j = i; j < 10; j++)
+    {
+      iset.push_back(RAJA::RangeSegment(j, j + 1));
     }
 
     RAJA::forall_Icount<RAJA::ExecPolicy<RAJA::seq_segit, ExecPolicy>>(
-      iset,
-      PluginTestCallable{data}
-    );
+        iset, PluginTestCallable {data});
 
-    for (int j = i; j < 10; j++) {
+    for (int j = i; j < 10; j++)
+    {
       CounterData loop_data;
       plugin_test_resource->memcpy(&loop_data, &data[j], sizeof(CounterData));
       ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
-      ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
-      ASSERT_EQ(loop_data.capture_counter_post,    i);
+      ASSERT_EQ(loop_data.capture_counter_pre, i + 1);
+      ASSERT_EQ(loop_data.capture_counter_post, i);
       ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
-      ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
-      ASSERT_EQ(loop_data.launch_counter_post,    i);
+      ASSERT_EQ(loop_data.launch_counter_pre, i + 1);
+      ASSERT_EQ(loop_data.launch_counter_post, i);
     }
   }
 
   CounterData plugin_data;
-  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                               sizeof(CounterData));
   ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+  ASSERT_EQ(plugin_data.capture_counter_post, 10);
   ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
-  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_counter_pre, 10);
+  ASSERT_EQ(plugin_data.launch_counter_post, 10);
 
   plugin_test_resource->deallocate(data);
 }
@@ -195,43 +191,43 @@ void PluginForAllIcountIdxSetTestImpl()
 TYPED_TEST_SUITE_P(PluginForallTest);
 template <typename T>
 class PluginForallTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(PluginForallTest, PluginForall)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ExecPolicy     = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType        = typename camp::at<TypeParam, camp::num<1>>::type;
   using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  PluginForallTestImpl<ExecPolicy, ResType, PlatformHolder::platform>( );
+  PluginForallTestImpl<ExecPolicy, ResType, PlatformHolder::platform>();
 }
 
 TYPED_TEST_P(PluginForallTest, PluginForAllICount)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ExecPolicy     = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType        = typename camp::at<TypeParam, camp::num<1>>::type;
   using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  PluginForAllICountTestImpl<ExecPolicy, ResType, PlatformHolder::platform>( );
+  PluginForAllICountTestImpl<ExecPolicy, ResType, PlatformHolder::platform>();
 }
 
 TYPED_TEST_P(PluginForallTest, PluginForAllIdxSet)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ExecPolicy     = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType        = typename camp::at<TypeParam, camp::num<1>>::type;
   using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  PluginForAllIdxSetTestImpl<ExecPolicy, ResType, PlatformHolder::platform>( );
+  PluginForAllIdxSetTestImpl<ExecPolicy, ResType, PlatformHolder::platform>();
 }
 
 TYPED_TEST_P(PluginForallTest, PluginForAllIcountIdxSet)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ExecPolicy     = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType        = typename camp::at<TypeParam, camp::num<1>>::type;
   using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  PluginForAllIcountIdxSetTestImpl<ExecPolicy, ResType, PlatformHolder::platform>( );
+  PluginForAllIcountIdxSetTestImpl<ExecPolicy, ResType,
+                                   PlatformHolder::platform>();
 }
 
 REGISTER_TYPED_TEST_SUITE_P(PluginForallTest,
diff --git a/test/integration/plugin/tests/test-plugin-kernel.hpp b/test/integration/plugin/tests/test-plugin-kernel.hpp
index b4bc9ebaf4..41a7cd92cd 100644
--- a/test/integration/plugin/tests/test-plugin-kernel.hpp
+++ b/test/integration/plugin/tests/test-plugin-kernel.hpp
@@ -21,40 +21,38 @@
 // once before and after each kernel invocation for the launch counter.
 
 // test with basic kernel
-template <typename KernelPolicy,
-          typename WORKING_RES,
-          RAJA::Platform PLATFORM>
+template <typename KernelPolicy, typename WORKING_RES, RAJA::Platform PLATFORM>
 void PluginKernelTestImpl()
 {
   SetupPluginVars spv(WORKING_RES::get_default());
 
   CounterData* data = plugin_test_resource->allocate<CounterData>(10);
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++)
+  {
 
-    RAJA::kernel<KernelPolicy>(
-      RAJA::make_tuple(RAJA::RangeSegment(i,i+1)),
-      PluginTestCallable{data}
-    );
+    RAJA::kernel<KernelPolicy>(RAJA::make_tuple(RAJA::RangeSegment(i, i + 1)),
+                               PluginTestCallable {data});
 
     CounterData loop_data;
     plugin_test_resource->memcpy(&loop_data, &data[i], sizeof(CounterData));
     ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.capture_counter_post,    i);
+    ASSERT_EQ(loop_data.capture_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.capture_counter_post, i);
     ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.launch_counter_post,    i);
+    ASSERT_EQ(loop_data.launch_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.launch_counter_post, i);
   }
 
   CounterData plugin_data;
-  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                               sizeof(CounterData));
   ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+  ASSERT_EQ(plugin_data.capture_counter_post, 10);
   ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
-  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_counter_pre, 10);
+  ASSERT_EQ(plugin_data.launch_counter_post, 10);
 
   plugin_test_resource->deallocate(data);
 }
@@ -63,19 +61,17 @@ void PluginKernelTestImpl()
 TYPED_TEST_SUITE_P(PluginKernelTest);
 template <typename T>
 class PluginKernelTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(PluginKernelTest, PluginKernel)
 {
-  using KernelPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using KernelPolicy   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType        = typename camp::at<TypeParam, camp::num<1>>::type;
   using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  PluginKernelTestImpl<KernelPolicy, ResType, PlatformHolder::platform>( );
+  PluginKernelTestImpl<KernelPolicy, ResType, PlatformHolder::platform>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(PluginKernelTest,
-                            PluginKernel);
+REGISTER_TYPED_TEST_SUITE_P(PluginKernelTest, PluginKernel);
 
 #endif  //__TEST_PLUGIN_KERNEL_HPP__
diff --git a/test/integration/plugin/tests/test-plugin-launch.hpp b/test/integration/plugin/tests/test-plugin-launch.hpp
index 2c516114cd..b01dadee8c 100644
--- a/test/integration/plugin/tests/test-plugin-launch.hpp
+++ b/test/integration/plugin/tests/test-plugin-launch.hpp
@@ -21,48 +21,46 @@
 // once before and after each launch invocation for the launch counter.
 
 // test with basic launch
-template <typename LaunchPolicy,
-          typename WORKING_RES,
-          RAJA::Platform PLATFORM>
+template <typename LaunchPolicy, typename WORKING_RES, RAJA::Platform PLATFORM>
 void PluginLaunchTestImpl()
 {
   SetupPluginVars spv(WORKING_RES::get_default());
 
   CounterData* data = plugin_test_resource->allocate<CounterData>(10);
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++)
+  {
 
-    //Keep PluginTestCallable within a scope to ensure
-    //destruction, consistent with other test
+    // Keep PluginTestCallable in its own scope so that it is destroyed
+    // before the checks below, consistent with the other tests
     {
-      PluginTestCallable p_callable{data};
-
-      RAJA::launch<LaunchPolicy>
-        (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
-         [=] RAJA_HOST_DEVICE(RAJA::LaunchContext RAJA_UNUSED_ARG(ctx))
-         {
-           p_callable(i);
-         });
+      PluginTestCallable p_callable {data};
+
+      RAJA::launch<LaunchPolicy>(
+          RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
+          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext RAJA_UNUSED_ARG(ctx))
+          { p_callable(i); });
     }
 
     CounterData loop_data;
     plugin_test_resource->memcpy(&loop_data, &data[i], sizeof(CounterData));
     ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.capture_counter_post,    i);
+    ASSERT_EQ(loop_data.capture_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.capture_counter_post, i);
     ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.launch_counter_post,    i);
+    ASSERT_EQ(loop_data.launch_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.launch_counter_post, i);
   }
 
   CounterData plugin_data;
-  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                               sizeof(CounterData));
   ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+  ASSERT_EQ(plugin_data.capture_counter_post, 10);
   ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
-  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_counter_pre, 10);
+  ASSERT_EQ(plugin_data.launch_counter_post, 10);
 
   plugin_test_resource->deallocate(data);
 }
@@ -71,19 +69,17 @@ void PluginLaunchTestImpl()
 TYPED_TEST_SUITE_P(PluginLaunchTest);
 template <typename T>
 class PluginLaunchTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(PluginLaunchTest, PluginLaunch)
 {
-  using LaunchPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using LaunchPolicy   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType        = typename camp::at<TypeParam, camp::num<1>>::type;
   using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  PluginLaunchTestImpl<LaunchPolicy, ResType, PlatformHolder::platform>( );
+  PluginLaunchTestImpl<LaunchPolicy, ResType, PlatformHolder::platform>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(PluginLaunchTest,
-                            PluginLaunch);
+REGISTER_TYPED_TEST_SUITE_P(PluginLaunchTest, PluginLaunch);
 
 #endif  //__TEST_PLUGIN_LAUNCH_HPP__
diff --git a/test/integration/plugin/tests/test-plugin-resource-launch.hpp b/test/integration/plugin/tests/test-plugin-resource-launch.hpp
index e4c216b72b..b5aaef62ee 100644
--- a/test/integration/plugin/tests/test-plugin-resource-launch.hpp
+++ b/test/integration/plugin/tests/test-plugin-resource-launch.hpp
@@ -21,9 +21,7 @@
 // once before and after each launch invocation for the launch counter.
 
 // test with basic launch
-template <typename LaunchPolicy,
-          typename WORKING_RES,
-          RAJA::Platform PLATFORM>
+template <typename LaunchPolicy, typename WORKING_RES, RAJA::Platform PLATFORM>
 void PluginResourceLaunchTestImpl()
 {
   WORKING_RES res;
@@ -32,39 +30,39 @@ void PluginResourceLaunchTestImpl()
 
   CounterData* data = plugin_test_resource->allocate<CounterData>(10);
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++)
+  {
 
-    //Keep PluginTestCallable within a scope to ensure
-    //destruction, consistent with other test
+    // Keep PluginTestCallable in its own scope so that it is destroyed
+    // before the checks below, consistent with the other tests
     {
-      PluginTestCallable p_callable{data};
-
-      RAJA::launch<LaunchPolicy>
-        (res, RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
-         [=] RAJA_HOST_DEVICE(RAJA::LaunchContext RAJA_UNUSED_ARG(ctx))
-         {
-           p_callable(i);
-         });
+      PluginTestCallable p_callable {data};
+
+      RAJA::launch<LaunchPolicy>(
+          res, RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
+          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext RAJA_UNUSED_ARG(ctx))
+          { p_callable(i); });
     }
 
     CounterData loop_data;
     plugin_test_resource->memcpy(&loop_data, &data[i], sizeof(CounterData));
     ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.capture_counter_post,    i);
+    ASSERT_EQ(loop_data.capture_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.capture_counter_post, i);
     ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
-    ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
-    ASSERT_EQ(loop_data.launch_counter_post,    i);
+    ASSERT_EQ(loop_data.launch_counter_pre, i + 1);
+    ASSERT_EQ(loop_data.launch_counter_post, i);
   }
 
   CounterData plugin_data;
-  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                               sizeof(CounterData));
   ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+  ASSERT_EQ(plugin_data.capture_counter_post, 10);
   ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
-  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_counter_pre, 10);
+  ASSERT_EQ(plugin_data.launch_counter_post, 10);
 
   plugin_test_resource->deallocate(data);
 }
@@ -73,19 +71,18 @@ void PluginResourceLaunchTestImpl()
 TYPED_TEST_SUITE_P(PluginResourceLaunchTest);
 template <typename T>
 class PluginResourceLaunchTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(PluginResourceLaunchTest, PluginResourceLaunch)
 {
-  using LaunchPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using LaunchPolicy   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType        = typename camp::at<TypeParam, camp::num<1>>::type;
   using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  PluginResourceLaunchTestImpl<LaunchPolicy, ResType, PlatformHolder::platform>( );
+  PluginResourceLaunchTestImpl<LaunchPolicy, ResType,
+                               PlatformHolder::platform>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(PluginResourceLaunchTest,
-                            PluginResourceLaunch);
+REGISTER_TYPED_TEST_SUITE_P(PluginResourceLaunchTest, PluginResourceLaunch);
 
 #endif  //__TEST_PLUGIN_LAUNCH_HPP__
diff --git a/test/integration/plugin/tests/test-plugin-workgroup.hpp b/test/integration/plugin/tests/test-plugin-workgroup.hpp
index 9e35aae7d2..3f2c3e1223 100644
--- a/test/integration/plugin/tests/test-plugin-workgroup.hpp
+++ b/test/integration/plugin/tests/test-plugin-workgroup.hpp
@@ -29,181 +29,193 @@ template <typename ExecPolicy,
           typename Allocator,
           typename WORKINGRES,
           RAJA::Platform PLATFORM>
-struct PluginWorkGroupTestImpl {
-void operator()() const
+struct PluginWorkGroupTestImpl
 {
-  using range_segment = RAJA::TypedRangeSegment<IndexType>;
-
-  using DispatchPolicy = typename DispatchTyper::template type<
-      camp::list<range_segment, PluginTestCallable> >;
-
-  using WorkPool_type = RAJA::WorkPool<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
-
-  using WorkGroup_type = RAJA::WorkGroup<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
-
-  using WorkSite_type = RAJA::WorkSite<
-                  RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                  IndexType,
-                  RAJA::xargs<>,
-                  Allocator
-                >;
-
-  SetupPluginVars spv(WORKINGRES{});
-
-  CounterData* data = plugin_test_resource->allocate<CounterData>(10);
-
+  void operator()() const
   {
-    CounterData loop_data[10];
-    for (int i = 0; i < 10; i++) {
-      loop_data[i].capture_platform_active = RAJA::Platform::undefined;
-      loop_data[i].capture_counter_pre     = -1;
-      loop_data[i].capture_counter_post    = -1;
-      loop_data[i].launch_platform_active = RAJA::Platform::undefined;
-      loop_data[i].launch_counter_pre     = -1;
-      loop_data[i].launch_counter_post    = -1;
+    using range_segment = RAJA::TypedRangeSegment<IndexType>;
+
+    using DispatchPolicy = typename DispatchTyper::template type<
+        camp::list<range_segment, PluginTestCallable>>;
+
+    using WorkPool_type =
+        RAJA::WorkPool<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
+
+    using WorkGroup_type =
+        RAJA::WorkGroup<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                              StoragePolicy, DispatchPolicy>,
+                        IndexType, RAJA::xargs<>, Allocator>;
+
+    using WorkSite_type =
+        RAJA::WorkSite<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<>, Allocator>;
+
+    SetupPluginVars spv(WORKINGRES {});
+
+    CounterData* data = plugin_test_resource->allocate<CounterData>(10);
+
+    {
+      CounterData loop_data[10];
+      for (int i = 0; i < 10; i++)
+      {
+        loop_data[i].capture_platform_active = RAJA::Platform::undefined;
+        loop_data[i].capture_counter_pre     = -1;
+        loop_data[i].capture_counter_post    = -1;
+        loop_data[i].launch_platform_active  = RAJA::Platform::undefined;
+        loop_data[i].launch_counter_pre      = -1;
+        loop_data[i].launch_counter_post     = -1;
+      }
+      plugin_test_resource->memcpy(data, &loop_data[0],
+                                   10 * sizeof(CounterData));
     }
-    plugin_test_resource->memcpy(data, &loop_data[0], 10*sizeof(CounterData));
-  }
-
-  WorkPool_type pool(Allocator{});
 
-  for (int i = 0; i < 10; i++) {
-    pool.enqueue(range_segment{i,i+1}, PluginTestCallable{data});
-  }
+    WorkPool_type pool(Allocator {});
 
-  {
-    CounterData plugin_data;
-    plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
-    ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-    ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-    ASSERT_EQ(plugin_data.capture_counter_post,    10);
-    ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-    ASSERT_EQ(plugin_data.launch_counter_pre,     0);
-    ASSERT_EQ(plugin_data.launch_counter_post,    0);
-  }
-
-  {
-    CounterData loop_data[10];
-    plugin_test_resource->memcpy(&loop_data[0], data, 10*sizeof(CounterData));
-
-    for (int i = 0; i < 10; i++) {
-      ASSERT_EQ(loop_data[i].capture_platform_active, RAJA::Platform::undefined);
-      ASSERT_EQ(loop_data[i].capture_counter_pre,     -1);
-      ASSERT_EQ(loop_data[i].capture_counter_post,    -1);
-      ASSERT_EQ(loop_data[i].launch_platform_active, RAJA::Platform::undefined);
-      ASSERT_EQ(loop_data[i].launch_counter_pre,     -1);
-      ASSERT_EQ(loop_data[i].launch_counter_post,    -1);
+    for (int i = 0; i < 10; i++)
+    {
+      pool.enqueue(range_segment {i, i + 1}, PluginTestCallable {data});
     }
-  }
 
-  WorkGroup_type group = pool.instantiate();
+    {
+      CounterData plugin_data;
+      plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                                   sizeof(CounterData));
+      ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
+      ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+      ASSERT_EQ(plugin_data.capture_counter_post, 10);
+      ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
+      ASSERT_EQ(plugin_data.launch_counter_pre, 0);
+      ASSERT_EQ(plugin_data.launch_counter_post, 0);
+    }
 
-  {
-    CounterData plugin_data;
-    plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
-    ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-    ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-    ASSERT_EQ(plugin_data.capture_counter_post,    10);
-    ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-    ASSERT_EQ(plugin_data.launch_counter_pre,     0);
-    ASSERT_EQ(plugin_data.launch_counter_post,    0);
-  }
+    {
+      CounterData loop_data[10];
+      plugin_test_resource->memcpy(&loop_data[0], data,
+                                   10 * sizeof(CounterData));
+
+      for (int i = 0; i < 10; i++)
+      {
+        ASSERT_EQ(loop_data[i].capture_platform_active,
+                  RAJA::Platform::undefined);
+        ASSERT_EQ(loop_data[i].capture_counter_pre, -1);
+        ASSERT_EQ(loop_data[i].capture_counter_post, -1);
+        ASSERT_EQ(loop_data[i].launch_platform_active,
+                  RAJA::Platform::undefined);
+        ASSERT_EQ(loop_data[i].launch_counter_pre, -1);
+        ASSERT_EQ(loop_data[i].launch_counter_post, -1);
+      }
+    }
 
-  {
-    CounterData loop_data[10];
-    plugin_test_resource->memcpy(&loop_data[0], data, 10*sizeof(CounterData));
-
-    for (int i = 0; i < 10; i++) {
-      ASSERT_EQ(loop_data[i].capture_platform_active, RAJA::Platform::undefined);
-      ASSERT_EQ(loop_data[i].capture_counter_pre,     -1);
-      ASSERT_EQ(loop_data[i].capture_counter_post,    -1);
-      ASSERT_EQ(loop_data[i].launch_platform_active, RAJA::Platform::undefined);
-      ASSERT_EQ(loop_data[i].launch_counter_pre,     -1);
-      ASSERT_EQ(loop_data[i].launch_counter_post,    -1);
+    WorkGroup_type group = pool.instantiate();
+
+    {
+      CounterData plugin_data;
+      plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                                   sizeof(CounterData));
+      ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
+      ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+      ASSERT_EQ(plugin_data.capture_counter_post, 10);
+      ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
+      ASSERT_EQ(plugin_data.launch_counter_pre, 0);
+      ASSERT_EQ(plugin_data.launch_counter_post, 0);
     }
-  }
 
-  WorkSite_type site = group.run();
+    {
+      CounterData loop_data[10];
+      plugin_test_resource->memcpy(&loop_data[0], data,
+                                   10 * sizeof(CounterData));
+
+      for (int i = 0; i < 10; i++)
+      {
+        ASSERT_EQ(loop_data[i].capture_platform_active,
+                  RAJA::Platform::undefined);
+        ASSERT_EQ(loop_data[i].capture_counter_pre, -1);
+        ASSERT_EQ(loop_data[i].capture_counter_post, -1);
+        ASSERT_EQ(loop_data[i].launch_platform_active,
+                  RAJA::Platform::undefined);
+        ASSERT_EQ(loop_data[i].launch_counter_pre, -1);
+        ASSERT_EQ(loop_data[i].launch_counter_post, -1);
+      }
+    }
 
-  {
-    CounterData plugin_data;
-    plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
-    ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
-    ASSERT_EQ(plugin_data.capture_counter_pre,     10);
-    ASSERT_EQ(plugin_data.capture_counter_post,    10);
-    ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
-    ASSERT_EQ(plugin_data.launch_counter_pre,     1);
-    ASSERT_EQ(plugin_data.launch_counter_post,    1);
-  }
+    WorkSite_type site = group.run();
+
+    {
+      CounterData plugin_data;
+      plugin_test_resource->memcpy(&plugin_data, plugin_test_data,
+                                   sizeof(CounterData));
+      ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
+      ASSERT_EQ(plugin_data.capture_counter_pre, 10);
+      ASSERT_EQ(plugin_data.capture_counter_post, 10);
+      ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
+      ASSERT_EQ(plugin_data.launch_counter_pre, 1);
+      ASSERT_EQ(plugin_data.launch_counter_post, 1);
+    }
 
-  {
-    CounterData loop_data[10];
-    plugin_test_resource->memcpy(&loop_data, data, 10*sizeof(CounterData));
-
-    for (int i = 0; i < 10; i++) {
-      ASSERT_EQ(loop_data[i].capture_platform_active, PLATFORM);
-      ASSERT_EQ(loop_data[i].capture_counter_pre,     i+1);
-      ASSERT_EQ(loop_data[i].capture_counter_post,    i);
-      ASSERT_EQ(loop_data[i].launch_platform_active, PLATFORM);
-      ASSERT_EQ(loop_data[i].launch_counter_pre,     1);
-      ASSERT_EQ(loop_data[i].launch_counter_post,    0);
+    {
+      CounterData loop_data[10];
+      plugin_test_resource->memcpy(&loop_data, data, 10 * sizeof(CounterData));
+
+      for (int i = 0; i < 10; i++)
+      {
+        ASSERT_EQ(loop_data[i].capture_platform_active, PLATFORM);
+        ASSERT_EQ(loop_data[i].capture_counter_pre, i + 1);
+        ASSERT_EQ(loop_data[i].capture_counter_post, i);
+        ASSERT_EQ(loop_data[i].launch_platform_active, PLATFORM);
+        ASSERT_EQ(loop_data[i].launch_counter_pre, 1);
+        ASSERT_EQ(loop_data[i].launch_counter_post, 0);
+      }
     }
-  }
 
-  plugin_test_resource->deallocate(data);
-}
+    plugin_test_resource->deallocate(data);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
           typename WORKINGRES,
-          RAJA::Platform PLATFORM
-          >
-struct PluginWorkGroupTestImpl<RAJA::hip_work<BLOCK_SIZE, Async>,
-                               RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                               StoragePolicy,
-                               detail::indirect_function_call_dispatch_typer,
-                               IndexType,
-                               Allocator,
-                               WORKINGRES,
-                               PLATFORM> {
-void operator()() const
-{ }
+          RAJA::Platform PLATFORM>
+struct PluginWorkGroupTestImpl<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKINGRES,
+    PLATFORM>
+{
+  void operator()() const {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
           typename Allocator,
           typename WORKINGRES,
-          RAJA::Platform PLATFORM
-          >
-struct PluginWorkGroupTestImpl<RAJA::hip_work<BLOCK_SIZE, Async>,
-                               RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                               StoragePolicy,
-                               detail::indirect_virtual_function_dispatch_typer,
-                               IndexType,
-                               Allocator,
-                               WORKINGRES,
-                               PLATFORM> {
-void operator()() const
-{ }
+          RAJA::Platform PLATFORM>
+struct PluginWorkGroupTestImpl<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    Allocator,
+    WORKINGRES,
+    PLATFORM>
+{
+  void operator()() const {}
 };
 
 #endif
@@ -212,24 +224,24 @@ void operator()() const
 TYPED_TEST_SUITE_P(PluginWorkGroupTest);
 template <typename T>
 class PluginWorkGroupTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(PluginWorkGroupTest, PluginWorkGroup)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using OrderPolicy = typename camp::at<TypeParam, camp::num<1>>::type;
-  using StoragePolicy = typename camp::at<TypeParam, camp::num<2>>::type;
-  using DispatchTyper = typename camp::at<TypeParam, camp::num<3>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<5>>::type;
+  using ExecPolicy       = typename camp::at<TypeParam, camp::num<0>>::type;
+  using OrderPolicy      = typename camp::at<TypeParam, camp::num<1>>::type;
+  using StoragePolicy    = typename camp::at<TypeParam, camp::num<2>>::type;
+  using DispatchTyper    = typename camp::at<TypeParam, camp::num<3>>::type;
+  using IndexType        = typename camp::at<TypeParam, camp::num<4>>::type;
+  using Allocator        = typename camp::at<TypeParam, camp::num<5>>::type;
   using WORKING_RESOURCE = typename camp::at<TypeParam, camp::num<6>>::type;
-  using PlatformHolder = typename camp::at<TypeParam, camp::num<7>>::type;
+  using PlatformHolder   = typename camp::at<TypeParam, camp::num<7>>::type;
 
-  PluginWorkGroupTestImpl<ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator, WORKING_RESOURCE, PlatformHolder::platform>{}( );
+  PluginWorkGroupTestImpl<ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper,
+                          IndexType, Allocator, WORKING_RESOURCE,
+                          PlatformHolder::platform> {}();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(PluginWorkGroupTest,
-                            PluginWorkGroup);
+REGISTER_TYPED_TEST_SUITE_P(PluginWorkGroupTest, PluginWorkGroup);
 
 #endif  //__TEST_PLUGIN_WORKGROUP_HPP__
diff --git a/test/integration/plugin/tests/test-plugin.hpp b/test/integration/plugin/tests/test-plugin.hpp
index 3371cb299b..aca9c8e47b 100644
--- a/test/integration/plugin/tests/test-plugin.hpp
+++ b/test/integration/plugin/tests/test-plugin.hpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Header file containing basic functional tests for atomic operations with forall and views.
+/// Header file containing the shared setup and callable helpers used by the
+/// plugin integration tests.
 ///
 
 #ifndef __TEST_PLUGIN_HPP__
@@ -26,29 +27,29 @@ camp::resources::Resource* plugin_test_resource = nullptr;
 struct SetupPluginVars
 {
   SetupPluginVars(camp::resources::Resource const test_resource)
-    : m_test_resource(test_resource)
+      : m_test_resource(test_resource)
   {
     // ASSERT_EQ(plugin_test_data, nullptr);
     // ASSERT_EQ(plugin_test_resource, nullptr);
 
-    plugin_test_data = m_test_resource.allocate<CounterData>(1);
+    plugin_test_data     = m_test_resource.allocate<CounterData>(1);
     plugin_test_resource = &m_test_resource;
 
     CounterData data;
     data.capture_platform_active = RAJA::Platform::undefined;
     data.capture_counter_pre     = 0;
     data.capture_counter_post    = 0;
-    data.launch_platform_active = RAJA::Platform::undefined;
-    data.launch_counter_pre     = 0;
-    data.launch_counter_post    = 0;
+    data.launch_platform_active  = RAJA::Platform::undefined;
+    data.launch_counter_pre      = 0;
+    data.launch_counter_post     = 0;
 
     m_test_resource.memcpy(plugin_test_data, &data, sizeof(CounterData));
   }
 
-  SetupPluginVars(SetupPluginVars const&) = delete;
-  SetupPluginVars(SetupPluginVars &&) = delete;
+  SetupPluginVars(SetupPluginVars const&)            = delete;
+  SetupPluginVars(SetupPluginVars&&)                 = delete;
   SetupPluginVars& operator=(SetupPluginVars const&) = delete;
-  SetupPluginVars& operator=(SetupPluginVars &&) = delete;
+  SetupPluginVars& operator=(SetupPluginVars&&)      = delete;
 
   ~SetupPluginVars()
   {
@@ -56,7 +57,7 @@ struct SetupPluginVars
     // ASSERT_NE(plugin_test_resource, nullptr);
 
     m_test_resource.deallocate(plugin_test_data);
-    plugin_test_data = nullptr;
+    plugin_test_data     = nullptr;
     plugin_test_resource = nullptr;
   }
 
@@ -68,16 +69,15 @@ struct SetupPluginVars
 struct PluginTestCallable
 {
   PluginTestCallable(CounterData* data_optr)
-    : m_data_optr(data_optr)
-    , m_data_iptr(plugin_test_data)
+      : m_data_optr(data_optr), m_data_iptr(plugin_test_data)
   {
     clear_data();
   }
 
   RAJA_HOST_DEVICE PluginTestCallable(PluginTestCallable const& rhs)
-    : m_data_optr(rhs.m_data_optr)
-    , m_data_iptr(rhs.m_data_iptr)
-    , m_data(rhs.m_data)
+      : m_data_optr(rhs.m_data_optr),
+        m_data_iptr(rhs.m_data_iptr),
+        m_data(rhs.m_data)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
@@ -88,24 +88,26 @@ struct PluginTestCallable
       plugin_test_resource->memcpy(&i_data, m_data_iptr, sizeof(CounterData));
 
       if (m_data.capture_platform_active == RAJA::Platform::undefined &&
-          i_data.capture_platform_active != RAJA::Platform::undefined) {
+          i_data.capture_platform_active != RAJA::Platform::undefined)
+      {
         m_data = i_data;
       }
     }
 #endif
   }
 
-  RAJA_HOST_DEVICE PluginTestCallable(PluginTestCallable && rhs)
-    : m_data_optr(rhs.m_data_optr)
-    , m_data_iptr(rhs.m_data_iptr)
-    , m_data(rhs.m_data)
+  RAJA_HOST_DEVICE PluginTestCallable(PluginTestCallable&& rhs)
+      : m_data_optr(rhs.m_data_optr),
+        m_data_iptr(rhs.m_data_iptr),
+        m_data(rhs.m_data)
   {
     rhs.clear();
   }
 
   RAJA_HOST_DEVICE PluginTestCallable& operator=(PluginTestCallable const& rhs)
   {
-    if (this != &rhs) {
+    if (this != &rhs)
+    {
       m_data_optr = rhs.m_data_optr;
       m_data_iptr = rhs.m_data_iptr;
       m_data      = rhs.m_data;
@@ -113,9 +115,10 @@ struct PluginTestCallable
     return *this;
   }
 
-  RAJA_HOST_DEVICE PluginTestCallable& operator=(PluginTestCallable && rhs)
+  RAJA_HOST_DEVICE PluginTestCallable& operator=(PluginTestCallable&& rhs)
   {
-    if (this != &rhs) {
+    if (this != &rhs)
+    {
       m_data_optr = rhs.m_data_optr;
       m_data_iptr = rhs.m_data_iptr;
       m_data      = rhs.m_data;
@@ -141,9 +144,9 @@ struct PluginTestCallable
   }
 
 private:
-        CounterData* m_data_optr = nullptr;
+  CounterData* m_data_optr       = nullptr;
   const CounterData* m_data_iptr = nullptr;
-        CounterData  m_data;
+  CounterData m_data;
 
 
   RAJA_HOST_DEVICE void clear()
@@ -158,9 +161,9 @@ struct PluginTestCallable
     m_data.capture_platform_active = RAJA::Platform::undefined;
     m_data.capture_counter_pre     = -1;
     m_data.capture_counter_post    = -1;
-    m_data.launch_platform_active = RAJA::Platform::undefined;
-    m_data.launch_counter_pre     = -1;
-    m_data.launch_counter_post    = -1;
+    m_data.launch_platform_active  = RAJA::Platform::undefined;
+    m_data.launch_counter_pre      = -1;
+    m_data.launch_counter_post     = -1;
   }
 };
 
diff --git a/test/integration/plugin_for_test_dynamic.cpp b/test/integration/plugin_for_test_dynamic.cpp
index dfd04f0a50..84a65d422a 100644
--- a/test/integration/plugin_for_test_dynamic.cpp
+++ b/test/integration/plugin_for_test_dynamic.cpp
@@ -8,16 +8,16 @@
 
 #include <exception>
 
-class ExceptionPlugin :
-  public RAJA::util::PluginStrategy
+class ExceptionPlugin : public RAJA::util::PluginStrategy
 {
-  public:
-  void preLaunch(const RAJA::util::PluginContext& RAJA_UNUSED_ARG(p)) override {
+public:
+  void preLaunch(const RAJA::util::PluginContext& RAJA_UNUSED_ARG(p)) override
+  {
     throw std::runtime_error("preLaunch");
   }
 };
 
-extern "C" RAJA::util::PluginStrategy *getPlugin()
+extern "C" RAJA::util::PluginStrategy* getPlugin()
 {
   return new ExceptionPlugin;
 }
diff --git a/test/integration/plugin_for_test_kokkos.cpp b/test/integration/plugin_for_test_kokkos.cpp
index d5bbc5a51d..f934d864f7 100644
--- a/test/integration/plugin_for_test_kokkos.cpp
+++ b/test/integration/plugin_for_test_kokkos.cpp
@@ -9,15 +9,19 @@
 
 #include <exception>
 
-extern "C" void kokkosp_init_library(const int RAJA_UNUSED_ARG(loadSeq),
-	const uint64_t RAJA_UNUSED_ARG(interfaceVer),
-	const uint32_t RAJA_UNUSED_ARG(devInfoCount),
-	void* RAJA_UNUSED_ARG(deviceInfo)) {}
+extern "C" void
+kokkosp_init_library(const int RAJA_UNUSED_ARG(loadSeq),
+                     const uint64_t RAJA_UNUSED_ARG(interfaceVer),
+                     const uint32_t RAJA_UNUSED_ARG(devInfoCount),
+                     void* RAJA_UNUSED_ARG(deviceInfo))
+{}
 
-extern "C" void kokkosp_begin_parallel_for(const char* RAJA_UNUSED_ARG(name),
-    const uint32_t RAJA_UNUSED_ARG(devID),
-    uint64_t* RAJA_UNUSED_ARG(kID)) {
-    throw std::runtime_error("preLaunch");
+extern "C" void
+kokkosp_begin_parallel_for(const char* RAJA_UNUSED_ARG(name),
+                           const uint32_t RAJA_UNUSED_ARG(devID),
+                           uint64_t* RAJA_UNUSED_ARG(kID))
+{
+  throw std::runtime_error("preLaunch");
 }
 
 extern "C" void kokkosp_end_parallel_for(const uint64_t RAJA_UNUSED_ARG(kID)) {}
diff --git a/test/integration/test_plugin_dynamic.cpp b/test/integration/test_plugin_dynamic.cpp
index 9cba6d0a77..5a3f157e97 100644
--- a/test/integration/test_plugin_dynamic.cpp
+++ b/test/integration/test_plugin_dynamic.cpp
@@ -14,7 +14,7 @@ TEST(PluginTestDynamic, Exception)
 
   ASSERT_ANY_THROW({
     RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, 10),
-                               [=](int i) { a[i] = 0; });
+                                 [=](int i) { a[i] = 0; });
   });
 
   delete[] a;
diff --git a/test/integration/test_plugin_kokkos.cpp b/test/integration/test_plugin_kokkos.cpp
index b8f05d8fef..521870494b 100644
--- a/test/integration/test_plugin_kokkos.cpp
+++ b/test/integration/test_plugin_kokkos.cpp
@@ -14,7 +14,7 @@ TEST(PluginTestKokkos, Exception)
 
   ASSERT_ANY_THROW({
     RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, 10),
-                               [=](int i) { a[i] = 0; });
+                                 [=](int i) { a[i] = 0; });
   });
 
   delete[] a;
diff --git a/test/old-tests/unit/cpu/test-synchronize.cpp b/test/old-tests/unit/cpu/test-synchronize.cpp
index 7750fcea5c..f933804f02 100644
--- a/test/old-tests/unit/cpu/test-synchronize.cpp
+++ b/test/old-tests/unit/cpu/test-synchronize.cpp
@@ -17,7 +17,8 @@ TEST(SynchronizeTest, omp)
 
 #pragma omp parallel shared(test_val)
   {
-    if (omp_get_thread_num() == 0) {
+    if (omp_get_thread_num() == 0)
+    {
       test_val = 5.0;
     }
 
diff --git a/test/old-tests/unit/cuda/test-synchronize.cpp b/test/old-tests/unit/cuda/test-synchronize.cpp
index b26b7a3445..dd46bfbdc1 100644
--- a/test/old-tests/unit/cuda/test-synchronize.cpp
+++ b/test/old-tests/unit/cuda/test-synchronize.cpp
@@ -16,16 +16,14 @@ GPU_TEST(SynchronizeTest, CUDA)
   double* managed_data;
   cudaErrchk(cudaMallocManaged(&managed_data, sizeof(double) * 50));
 
-  RAJA::forall<RAJA::cuda_exec_async<256>>( RAJA::RangeSegment(0, 50),
-    [=] RAJA_HOST_DEVICE(RAJA::Index_type i) {
-    managed_data[i] = 1.0 * i;
-  });
+  RAJA::forall<RAJA::cuda_exec_async<256>>(
+      RAJA::RangeSegment(0, 50),
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { managed_data[i] = 1.0 * i; });
   RAJA::synchronize<RAJA::cuda_synchronize>();
 
-  RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(0, 50),
-    [=](RAJA::Index_type i) {
-    EXPECT_EQ(managed_data[i], 1.0 * i);
-  });
+  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, 50),
+                               [=](RAJA::Index_type i)
+                               { EXPECT_EQ(managed_data[i], 1.0 * i); });
 
   cudaErrchk(cudaFree(managed_data));
 }
diff --git a/test/old-tests/unit/test-sharedmem.cpp b/test/old-tests/unit/test-sharedmem.cpp
index a6f4ffcbc5..504a850576 100644
--- a/test/old-tests/unit/test-sharedmem.cpp
+++ b/test/old-tests/unit/test-sharedmem.cpp
@@ -24,9 +24,9 @@
 using namespace RAJA;
 using namespace RAJA::statement;
 
-//Define tile size ( TILE_DIM x TILE_DIM )
-//Matrix transpose and matrix multiplication
-//are carried out via tiling algorithms
+// Define tile size ( TILE_DIM x TILE_DIM )
+// Matrix transpose and matrix multiplication
+// are carried out via tiling algorithms
 RAJA_INDEX_VALUE(TX, "TX");
 RAJA_INDEX_VALUE(TY, "TY");
 
@@ -45,73 +45,85 @@ GPU_TYPED_TEST_P(TypedLocalMem, Basic)
 {
   using Pol = at_v<TypeParam, 0>;
 
-  const int DIM = 2;
+  const int DIM    = 2;
   const int N_rows = 144;
   const int N_cols = 255;
 
   const int inner_Dim0 = TILE_DIM;
   const int inner_Dim1 = TILE_DIM;
 
-  const int outer_Dim0 = (N_cols-1)/TILE_DIM+1;
-  const int outer_Dim1 = (N_rows-1)/TILE_DIM+1;
+  const int outer_Dim0 = (N_cols - 1) / TILE_DIM + 1;
+  const int outer_Dim1 = (N_rows - 1) / TILE_DIM + 1;
 
   double *A, *B;
 #if defined(RAJA_ENABLE_CUDA)
   size_t Arr_sz = N_rows * N_cols;
-  cudaErrchk(cudaMallocManaged(&A,  sizeof(double) * Arr_sz));
-  cudaErrchk(cudaMallocManaged(&B, sizeof(double)  * Arr_sz));
+  cudaErrchk(cudaMallocManaged(&A, sizeof(double) * Arr_sz));
+  cudaErrchk(cudaMallocManaged(&B, sizeof(double) * Arr_sz));
 #else
-  A  = new double[N_rows * N_cols];
-  B  = new double[N_rows * N_cols];
+  A = new double[N_rows * N_cols];
+  B = new double[N_rows * N_cols];
 #endif
 
   RAJA::TypedView<double, RAJA::Layout<DIM>, TY, TX> Aview(A, N_rows, N_cols);
   RAJA::TypedView<double, RAJA::Layout<DIM>, TY, TX> Bview(B, N_rows, N_cols);
 
-  for (int row = 0; row < N_rows; ++row) {
-    for (int col= 0 ; col < N_cols; ++col) {
-      A[col + N_cols*row] = col;
+  for (int row = 0; row < N_rows; ++row)
+  {
+    for (int col = 0; col < N_cols; ++col)
+    {
+      A[col + N_cols * row] = col;
     }
   }
 
-  using SharedTile = AtomicTypedLocalArray<RAJA::auto_atomic, double, RAJA::PERM_IJ, RAJA::SizeList<TILE_DIM,TILE_DIM>, TY, TX>;
+  using SharedTile =
+      AtomicTypedLocalArray<RAJA::auto_atomic, double, RAJA::PERM_IJ,
+                            RAJA::SizeList<TILE_DIM, TILE_DIM>, TY, TX>;
   SharedTile myTile, myTile2;
 
   const TX TX_TILE_DIM(16);
   const TY TY_TILE_DIM(16);
 
-  RAJA::kernel_param<Pol>(RAJA::make_tuple(RAJA::TypedRangeSegment<TX>(0, inner_Dim0), RAJA::TypedRangeSegment<TY>(0,inner_Dim1),
-                                           RAJA::TypedRangeSegment<TX>(0, outer_Dim0), RAJA::TypedRangeSegment<TY>(0,outer_Dim1)),
-                          RAJA::make_tuple(myTile, myTile2),
-
-  //Load data into shared memory
-  [=] RAJA_HOST_DEVICE (TX tx, TY ty, TX bx, TY by, SharedTile &myTile, SharedTile &) {
-
-    TX col = bx * TX_TILE_DIM + tx;  // Matrix column index
-    TY row = by * TY_TILE_DIM + ty;  // Matrix row index
-
-    if(row < N_rows && col < N_cols){
-      myTile(ty,tx)   = Aview(row, col);
-    }
-
-  },
-
-  //read from shared mem
-  [=] RAJA_HOST_DEVICE (TX tx, TY ty, TX bx, TY by, SharedTile &myTile, SharedTile &) {
-
-    TX col = bx * TX_TILE_DIM + tx;  // Matrix column index
-    TY row = by * TY_TILE_DIM + ty;  // Matrix row index
-
-    if(row < N_rows && col < N_cols){
-      Bview(row, col) = myTile(ty, tx);
-    }
-
-  });
+  RAJA::kernel_param<Pol>(
+      RAJA::make_tuple(RAJA::TypedRangeSegment<TX>(0, inner_Dim0),
+                       RAJA::TypedRangeSegment<TY>(0, inner_Dim1),
+                       RAJA::TypedRangeSegment<TX>(0, outer_Dim0),
+                       RAJA::TypedRangeSegment<TY>(0, outer_Dim1)),
+      RAJA::make_tuple(myTile, myTile2),
+
+      // Load data into shared memory
+      [=] RAJA_HOST_DEVICE(TX tx, TY ty, TX bx, TY by, SharedTile & myTile,
+                           SharedTile&)
+      {
+        TX col = bx * TX_TILE_DIM + tx;  // Matrix column index
+        TY row = by * TY_TILE_DIM + ty;  // Matrix row index
+
+        if (row < N_rows && col < N_cols)
+        {
+          myTile(ty, tx) = Aview(row, col);
+        }
+      },
+
+      // read from shared mem
+      [=] RAJA_HOST_DEVICE(TX tx, TY ty, TX bx, TY by, SharedTile & myTile,
+                           SharedTile&)
+      {
+        TX col = bx * TX_TILE_DIM + tx;  // Matrix column index
+        TY row = by * TY_TILE_DIM + ty;  // Matrix row index
+
+        if (row < N_rows && col < N_cols)
+        {
+          Bview(row, col) = myTile(ty, tx);
+        }
+      });
 
-  //Check result
-  for (int row = 0; row < N_rows; ++row) {
-    for (int col = 0; col < N_cols; ++col) {
-      ASSERT_FLOAT_EQ((double)B[col + row*N_cols], (double)A[col + row*N_cols]);
+  // Check result
+  for (int row = 0; row < N_rows; ++row)
+  {
+    for (int col = 0; col < N_cols; ++col)
+    {
+      ASSERT_FLOAT_EQ((double)B[col + row * N_cols],
+                      (double)A[col + row * N_cols]);
     }
   }
 
@@ -119,8 +131,8 @@ GPU_TYPED_TEST_P(TypedLocalMem, Basic)
   cudaErrchk(cudaFree(A));
   cudaErrchk(cudaFree(B));
 #else
-  delete [] A;
-  delete [] B;
+  delete[] A;
+  delete[] B;
 #endif
 }
 
@@ -140,92 +152,105 @@ GPU_TYPED_TEST_P(TypedLocalMem_gpu, Basic)
 {
   using Pol = at_v<TypeParam, 0>;
 
-  const int DIM = 2;
+  const int DIM    = 2;
   const int N_rows = 144;
   const int N_cols = 255;
 
   const int inner_Dim0 = TILE_DIM;
   const int inner_Dim1 = TILE_DIM;
 
-  const int outer_Dim0 = (N_cols-1)/TILE_DIM+1;
-  const int outer_Dim1 = (N_rows-1)/TILE_DIM+1;
+  const int outer_Dim0 = (N_cols - 1) / TILE_DIM + 1;
+  const int outer_Dim1 = (N_rows - 1) / TILE_DIM + 1;
 
   double *A, *B;
   double *d_A, *d_B;
   size_t Arr_sz = N_rows * N_cols;
   hipMalloc(&d_A, sizeof(double) * Arr_sz);
   hipMalloc(&d_B, sizeof(double) * Arr_sz);
-  A  = new double[N_rows * N_cols];
-  B  = new double[N_rows * N_cols];
+  A = new double[N_rows * N_cols];
+  B = new double[N_rows * N_cols];
 
   RAJA::TypedView<double, RAJA::Layout<DIM>, TY, TX> Aview(A, N_rows, N_cols);
   RAJA::TypedView<double, RAJA::Layout<DIM>, TY, TX> Bview(B, N_rows, N_cols);
-  RAJA::TypedView<double, RAJA::Layout<DIM>, TY, TX> d_Aview(d_A, N_rows, N_cols);
-  RAJA::TypedView<double, RAJA::Layout<DIM>, TY, TX> d_Bview(d_B, N_rows, N_cols);
+  RAJA::TypedView<double, RAJA::Layout<DIM>, TY, TX> d_Aview(d_A, N_rows,
+                                                             N_cols);
+  RAJA::TypedView<double, RAJA::Layout<DIM>, TY, TX> d_Bview(d_B, N_rows,
+                                                             N_cols);
 
-  for (int row = 0; row < N_rows; ++row) {
-    for (int col= 0 ; col < N_cols; ++col) {
-      A[col + N_cols*row] = col;
+  for (int row = 0; row < N_rows; ++row)
+  {
+    for (int col = 0; col < N_cols; ++col)
+    {
+      A[col + N_cols * row] = col;
     }
   }
 
-  hipMemcpy(d_A, A, Arr_sz*sizeof(double), hipMemcpyHostToDevice);
+  hipMemcpy(d_A, A, Arr_sz * sizeof(double), hipMemcpyHostToDevice);
 
-  using SharedTile = TypedLocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<TILE_DIM,TILE_DIM>, TY, TX>;
+  using SharedTile =
+      TypedLocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<TILE_DIM, TILE_DIM>,
+                      TY, TX>;
   SharedTile myTile, myTile2;
 
   const TX TX_TILE_DIM(16);
   const TY TY_TILE_DIM(16);
 
-  RAJA::kernel_param<Pol>(RAJA::make_tuple(RAJA::TypedRangeSegment<TX>(0, inner_Dim0), RAJA::TypedRangeSegment<TY>(0,inner_Dim1),
-                                           RAJA::TypedRangeSegment<TX>(0, outer_Dim0), RAJA::TypedRangeSegment<TY>(0,outer_Dim1)),
-                          RAJA::make_tuple(myTile, myTile2),
-
-  //Load data into shared memory
-  [=] RAJA_HOST_DEVICE (TX tx, TY ty, TX bx, TY by, SharedTile &myTile, SharedTile &) {
-
-    TX col = bx * TX_TILE_DIM + tx;  // Matrix column index
-    TY row = by * TY_TILE_DIM + ty;  // Matrix row index
-
-    if(row < N_rows && col < N_cols){
-      myTile(ty,tx)   = d_Aview(row, col);
-    }
-
-  },
-
-  //read from shared mem
-  [=] RAJA_HOST_DEVICE (TX tx, TY ty, TX bx, TY by, SharedTile &myTile, SharedTile &) {
-
-    TX col = bx * TX_TILE_DIM + tx;  // Matrix column index
-    TY row = by * TY_TILE_DIM + ty;  // Matrix row index
-
-    if(row < N_rows && col < N_cols){
-      d_Bview(row, col) = myTile(ty, tx);
-    }
-
-  });
+  RAJA::kernel_param<Pol>(
+      RAJA::make_tuple(RAJA::TypedRangeSegment<TX>(0, inner_Dim0),
+                       RAJA::TypedRangeSegment<TY>(0, inner_Dim1),
+                       RAJA::TypedRangeSegment<TX>(0, outer_Dim0),
+                       RAJA::TypedRangeSegment<TY>(0, outer_Dim1)),
+      RAJA::make_tuple(myTile, myTile2),
+
+      // Load data into shared memory
+      [=] RAJA_HOST_DEVICE(TX tx, TY ty, TX bx, TY by, SharedTile & myTile,
+                           SharedTile&)
+      {
+        TX col = bx * TX_TILE_DIM + tx;  // Matrix column index
+        TY row = by * TY_TILE_DIM + ty;  // Matrix row index
+
+        if (row < N_rows && col < N_cols)
+        {
+          myTile(ty, tx) = d_Aview(row, col);
+        }
+      },
+
+      // read from shared mem
+      [=] RAJA_HOST_DEVICE(TX tx, TY ty, TX bx, TY by, SharedTile & myTile,
+                           SharedTile&)
+      {
+        TX col = bx * TX_TILE_DIM + tx;  // Matrix column index
+        TY row = by * TY_TILE_DIM + ty;  // Matrix row index
+
+        if (row < N_rows && col < N_cols)
+        {
+          d_Bview(row, col) = myTile(ty, tx);
+        }
+      });
 
-  hipMemcpy(B, d_B, Arr_sz*sizeof(double), hipMemcpyDeviceToHost);
+  hipMemcpy(B, d_B, Arr_sz * sizeof(double), hipMemcpyDeviceToHost);
 
-  //Check result
-  for (int row = 0; row < N_rows; ++row) {
-    for (int col = 0; col < N_cols; ++col) {
-      ASSERT_FLOAT_EQ(B[col + row*N_cols], A[col + row*N_cols]);
+  // Check result
+  for (int row = 0; row < N_rows; ++row)
+  {
+    for (int col = 0; col < N_cols; ++col)
+    {
+      ASSERT_FLOAT_EQ(B[col + row * N_cols], A[col + row * N_cols]);
     }
   }
 
   hipFree(d_A);
   hipFree(d_B);
-  delete [] A;
-  delete [] B;
+  delete[] A;
+  delete[] B;
 }
 
 REGISTER_TYPED_TEST_SUITE_P(TypedLocalMem_gpu, Basic);
-#endif //defined(RAJA_ENABLE_HIP)
+#endif  // defined(RAJA_ENABLE_HIP)
 
 
 //
-//Matrix transpose example - test all variants
+// Matrix transpose example - test all variants
 //
 template <typename NestedPolicy>
 class MatTranspose : public ::testing::Test
@@ -241,21 +266,21 @@ GPU_TYPED_TEST_P(MatTranspose, Basic)
 
   using Pol = at_v<TypeParam, 0>;
 
-  const int DIM = 2;
+  const int DIM    = 2;
   const int N_rows = 144;
   const int N_cols = 255;
 
   const int inner_Dim0 = TILE_DIM;
   const int inner_Dim1 = TILE_DIM;
 
-  const int outer_Dim0 = (N_cols-1)/TILE_DIM+1;
-  const int outer_Dim1 = (N_rows-1)/TILE_DIM+1;
+  const int outer_Dim0 = (N_cols - 1) / TILE_DIM + 1;
+  const int outer_Dim1 = (N_rows - 1) / TILE_DIM + 1;
 
   double *A, *At, *B, *Bt;
 #if defined(RAJA_ENABLE_CUDA)
-  cudaErrchk(cudaMallocManaged(&A,  sizeof(double) * N_rows * N_cols));
+  cudaErrchk(cudaMallocManaged(&A, sizeof(double) * N_rows * N_cols));
   cudaErrchk(cudaMallocManaged(&At, sizeof(double) * N_rows * N_cols));
-  cudaErrchk(cudaMallocManaged(&B,  sizeof(double) * N_rows * N_cols));
+  cudaErrchk(cudaMallocManaged(&B, sizeof(double) * N_rows * N_cols));
   cudaErrchk(cudaMallocManaged(&Bt, sizeof(double) * N_rows * N_cols));
 #else
   A  = new double[N_rows * N_cols];
@@ -271,53 +296,62 @@ GPU_TYPED_TEST_P(MatTranspose, Basic)
   RAJA::View<double, RAJA::Layout<DIM>> Btview(Bt, N_cols, N_rows);
 
 
-  for (int row = 0; row < N_rows; ++row) {
-    for (int col = 0; col < N_cols; ++col) {
+  for (int row = 0; row < N_rows; ++row)
+  {
+    for (int col = 0; col < N_cols; ++col)
+    {
       Aview(row, col) = col;
       Bview(row, col) = col;
     }
   }
 
 
-  using SharedTile = LocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<TILE_DIM,TILE_DIM>>;
+  using SharedTile =
+      LocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<TILE_DIM, TILE_DIM>>;
 
   SharedTile myTile, myTile2;
 
-  RAJA::kernel_param<Pol>(RAJA::make_tuple(RAJA::RangeSegment(0, inner_Dim0), RAJA::RangeSegment(0,inner_Dim1),
-                                           RAJA::RangeSegment(0, outer_Dim0), RAJA::RangeSegment(0,outer_Dim1)),
-                          RAJA::make_tuple(myTile, myTile2),
-
-  //Load data into shared memory
-  [=] RAJA_HOST_DEVICE (int tx, int ty, int bx, int by, SharedTile &myTile, SharedTile &myTile2) {
-
-    int col = bx * TILE_DIM + tx;  // Matrix column index
-    int row = by * TILE_DIM + ty;  // Matrix row index
-
-    if(row < N_rows && col < N_cols){
-      myTile(ty,tx)  = Aview(row, col);
-      myTile2(ty,tx) = Bview(row, col);
-    }
-
-  },
-
-  //read from shared mem
-  [=] RAJA_HOST_DEVICE (int tx, int ty, int bx, int by, SharedTile &myTile, SharedTile &myTile2) {
-
-    int col = by * TILE_DIM + tx;  // Transposed matrix column index
-    int row = bx * TILE_DIM + ty;  // Transposed matrix row index
-
-    if(row < N_cols && col < N_rows){
-      Atview(row, col) = myTile(tx,ty);
-      Btview(row, col) = myTile2(tx,ty);
-    }
-
-  });
+  RAJA::kernel_param<Pol>(
+      RAJA::make_tuple(
+          RAJA::RangeSegment(0, inner_Dim0), RAJA::RangeSegment(0, inner_Dim1),
+          RAJA::RangeSegment(0, outer_Dim0), RAJA::RangeSegment(0, outer_Dim1)),
+      RAJA::make_tuple(myTile, myTile2),
+
+      // Load data into shared memory
+      [=] RAJA_HOST_DEVICE(int tx, int ty, int bx, int by, SharedTile& myTile,
+                           SharedTile& myTile2)
+      {
+        int col = bx * TILE_DIM + tx;  // Matrix column index
+        int row = by * TILE_DIM + ty;  // Matrix row index
+
+        if (row < N_rows && col < N_cols)
+        {
+          myTile(ty, tx)  = Aview(row, col);
+          myTile2(ty, tx) = Bview(row, col);
+        }
+      },
+
+      // read from shared mem
+      [=] RAJA_HOST_DEVICE(int tx, int ty, int bx, int by, SharedTile& myTile,
+                           SharedTile& myTile2)
+      {
+        int col = by * TILE_DIM + tx;  // Transposed matrix column index
+        int row = bx * TILE_DIM + ty;  // Transposed matrix row index
+
+        if (row < N_cols && col < N_rows)
+        {
+          Atview(row, col) = myTile(tx, ty);
+          Btview(row, col) = myTile2(tx, ty);
+        }
+      });
 
-  //Check result
-  for (int row = 0; row < N_rows; ++row) {
-    for (int col = 0; col < N_cols; ++col) {
-      ASSERT_FLOAT_EQ((double)Atview(col,row), (double)col);
-      ASSERT_FLOAT_EQ((double)Btview(col,row), (double)col);
+  // Check result
+  for (int row = 0; row < N_rows; ++row)
+  {
+    for (int col = 0; col < N_cols; ++col)
+    {
+      ASSERT_FLOAT_EQ((double)Atview(col, row), (double)col);
+      ASSERT_FLOAT_EQ((double)Btview(col, row), (double)col);
     }
   }
 
@@ -328,10 +362,10 @@ GPU_TYPED_TEST_P(MatTranspose, Basic)
   cudaErrchk(cudaFree(B));
   cudaErrchk(cudaFree(Bt));
 #else
-  delete [] A;
-  delete [] At;
-  delete [] B;
-  delete [] Bt;
+  delete[] A;
+  delete[] At;
+  delete[] B;
+  delete[] Bt;
 #endif
 }
 
@@ -353,21 +387,21 @@ GPU_TYPED_TEST_P(MatTranspose_gpu, Basic)
 
   using Pol = at_v<TypeParam, 0>;
 
-  const int DIM = 2;
+  const int DIM    = 2;
   const int N_rows = 144;
   const int N_cols = 255;
 
   const int inner_Dim0 = TILE_DIM;
   const int inner_Dim1 = TILE_DIM;
 
-  const int outer_Dim0 = (N_cols-1)/TILE_DIM+1;
-  const int outer_Dim1 = (N_rows-1)/TILE_DIM+1;
+  const int outer_Dim0 = (N_cols - 1) / TILE_DIM + 1;
+  const int outer_Dim1 = (N_rows - 1) / TILE_DIM + 1;
 
   double *A, *At, *B, *Bt;
   double *d_A, *d_At, *d_B, *d_Bt;
-  hipMalloc(&d_A,  sizeof(double) * N_rows * N_cols);
+  hipMalloc(&d_A, sizeof(double) * N_rows * N_cols);
   hipMalloc(&d_At, sizeof(double) * N_rows * N_cols);
-  hipMalloc(&d_B,  sizeof(double) * N_rows * N_cols);
+  hipMalloc(&d_B, sizeof(double) * N_rows * N_cols);
   hipMalloc(&d_Bt, sizeof(double) * N_rows * N_cols);
   A  = new double[N_rows * N_cols];
   At = new double[N_rows * N_cols];
@@ -387,8 +421,10 @@ GPU_TYPED_TEST_P(MatTranspose_gpu, Basic)
   RAJA::View<double, RAJA::Layout<DIM>> d_Btview(d_Bt, N_cols, N_rows);
 
 
-  for (int row = 0; row < N_rows; ++row) {
-    for (int col = 0; col < N_cols; ++col) {
+  for (int row = 0; row < N_rows; ++row)
+  {
+    for (int col = 0; col < N_cols; ++col)
+    {
       Aview(row, col) = col;
       Bview(row, col) = col;
     }
@@ -398,48 +434,55 @@ GPU_TYPED_TEST_P(MatTranspose_gpu, Basic)
   hipMemcpy(d_B, B, N_rows * N_cols * sizeof(double), hipMemcpyHostToDevice);
 
 
-  using SharedTile = LocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<TILE_DIM,TILE_DIM>>;
+  using SharedTile =
+      LocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<TILE_DIM, TILE_DIM>>;
 
   SharedTile myTile, myTile2;
 
-  RAJA::kernel_param<Pol>(RAJA::make_tuple(RAJA::RangeSegment(0, inner_Dim0), RAJA::RangeSegment(0,inner_Dim1),
-                                           RAJA::RangeSegment(0, outer_Dim0), RAJA::RangeSegment(0,outer_Dim1)),
-                          RAJA::make_tuple(myTile, myTile2),
-
-  //Load data into shared memory
-  [=] RAJA_HOST_DEVICE (int tx, int ty, int bx, int by, SharedTile &myTile, SharedTile &myTile2) {
-
-    int col = bx * TILE_DIM + tx;  // Matrix column index
-    int row = by * TILE_DIM + ty;  // Matrix row index
-
-    if(row < N_rows && col < N_cols){
-      myTile(ty,tx)  = d_Aview(row, col);
-      myTile2(ty,tx) = d_Bview(row, col);
-    }
-
-  },
-
-  //read from shared mem
-  [=] RAJA_HOST_DEVICE (int tx, int ty, int bx, int by, SharedTile &myTile, SharedTile &myTile2) {
-
-    int col = by * TILE_DIM + tx;  // Transposed matrix column index
-    int row = bx * TILE_DIM + ty;  // Transposed matrix row index
-
-    if(row < N_cols && col < N_rows){
-      d_Atview(row, col) = myTile(tx,ty);
-      d_Btview(row, col) = myTile2(tx,ty);
-    }
-
-  });
+  RAJA::kernel_param<Pol>(
+      RAJA::make_tuple(
+          RAJA::RangeSegment(0, inner_Dim0), RAJA::RangeSegment(0, inner_Dim1),
+          RAJA::RangeSegment(0, outer_Dim0), RAJA::RangeSegment(0, outer_Dim1)),
+      RAJA::make_tuple(myTile, myTile2),
+
+      // Load data into shared memory
+      [=] RAJA_HOST_DEVICE(int tx, int ty, int bx, int by, SharedTile& myTile,
+                           SharedTile& myTile2)
+      {
+        int col = bx * TILE_DIM + tx;  // Matrix column index
+        int row = by * TILE_DIM + ty;  // Matrix row index
+
+        if (row < N_rows && col < N_cols)
+        {
+          myTile(ty, tx)  = d_Aview(row, col);
+          myTile2(ty, tx) = d_Bview(row, col);
+        }
+      },
+
+      // read from shared mem
+      [=] RAJA_HOST_DEVICE(int tx, int ty, int bx, int by, SharedTile& myTile,
+                           SharedTile& myTile2)
+      {
+        int col = by * TILE_DIM + tx;  // Transposed matrix column index
+        int row = bx * TILE_DIM + ty;  // Transposed matrix row index
+
+        if (row < N_cols && col < N_rows)
+        {
+          d_Atview(row, col) = myTile(tx, ty);
+          d_Btview(row, col) = myTile2(tx, ty);
+        }
+      });
 
   hipMemcpy(At, d_At, N_rows * N_cols * sizeof(double), hipMemcpyDeviceToHost);
   hipMemcpy(Bt, d_Bt, N_rows * N_cols * sizeof(double), hipMemcpyDeviceToHost);
 
-  //Check result
-  for (int row = 0; row < N_rows; ++row) {
-    for (int col = 0; col < N_cols; ++col) {
-      ASSERT_FLOAT_EQ(Atview(col,row), col);
-      ASSERT_FLOAT_EQ(Btview(col,row), col);
+  // Check result
+  for (int row = 0; row < N_rows; ++row)
+  {
+    for (int col = 0; col < N_cols; ++col)
+    {
+      ASSERT_FLOAT_EQ(Atview(col, row), col);
+      ASSERT_FLOAT_EQ(Btview(col, row), col);
     }
   }
 
@@ -448,149 +491,170 @@ GPU_TYPED_TEST_P(MatTranspose_gpu, Basic)
   hipFree(d_At);
   hipFree(d_B);
   hipFree(d_Bt);
-  delete [] A;
-  delete [] At;
-  delete [] B;
-  delete [] Bt;
+  delete[] A;
+  delete[] At;
+  delete[] B;
+  delete[] Bt;
 }
 
 REGISTER_TYPED_TEST_SUITE_P(MatTranspose_gpu, Basic);
 
-#endif //defined(RAJA_ENABLE_HIP)
+#endif  // defined(RAJA_ENABLE_HIP)
 
 using SeqTypes =
-  ::testing::Types<
-  RAJA::list<
-    RAJA::KernelPolicy<
-        RAJA::statement::For<3, RAJA::seq_exec,
-          RAJA::statement::For<2, RAJA::seq_exec,
-
-          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<0,1>,
-
-              //Load data into shared memory
-              RAJA::statement::For<1, RAJA::seq_exec,
-                RAJA::statement::For<0, RAJA::seq_exec,
-                  RAJA::statement::Lambda<0>
-                                   >
-                                 >,
-
-                //Read data from shared memory
-                RAJA::statement::For<1, RAJA::seq_exec,
-                  RAJA::statement::For<0, RAJA::seq_exec,
-                    RAJA::statement::Lambda<1> > >
-
-              > //close shared memory scope
-            >//for 2
-        >//for 3
-      > //kernel policy
-    > //list
-  >; //types
+    ::testing::Types<RAJA::list<RAJA::KernelPolicy<RAJA::statement::For<
+        3,
+        RAJA::seq_exec,
+        RAJA::statement::For<
+            2,
+            RAJA::seq_exec,
+
+            RAJA::statement::InitLocalMem<
+                RAJA::cpu_tile_mem,
+                RAJA::ParamList<0, 1>,
+
+                // Load data into shared memory
+                RAJA::statement::For<
+                    1,
+                    RAJA::seq_exec,
+                    RAJA::statement::
+                        For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>,
+
+                // Read data from shared memory
+                RAJA::statement::For<
+                    1,
+                    RAJA::seq_exec,
+                    RAJA::statement::For<0,
+                                         RAJA::seq_exec,
+                                         RAJA::statement::Lambda<1>>>
+
+                >  // close shared memory scope
+            >      // for 2
+        >          // for 3
+                                                   >  // kernel policy
+                                >                     // list
+                     >;                               // types
 INSTANTIATE_TYPED_TEST_SUITE_P(Seq, MatTranspose, SeqTypes);
 INSTANTIATE_TYPED_TEST_SUITE_P(Seq, TypedLocalMem, SeqTypes);
 
 
 #if defined(RAJA_ENABLE_OPENMP)
-using TestTypes =
-  ::testing::Types<
-  RAJA::list<
-    RAJA::KernelPolicy<
-      RAJA::statement::For<3, RAJA::seq_exec,
-        RAJA::statement::For<2, RAJA::seq_exec,
-
-          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<0,1>,
-
-           //Load data into shared memory
-           RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
-                                     RAJA::ArgList<0, 1>,
-                                     RAJA::statement::Lambda<0>
-                                     >,
-
-           //Read data from shared memory
-           RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
-                                     RAJA::ArgList<0, 1>,
-                                     RAJA::statement::Lambda<1>
-                                     >
-                                 >
-        >//for 2
-       >//for 3
-       > //close policy
-     > //close list
-
-  ,RAJA::list<
-      RAJA::KernelPolicy<
-      RAJA::statement::For<3, RAJA::seq_exec,
-        RAJA::statement::For<2, RAJA::seq_exec,
-
-          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<0,1>,
-
-           //Load data into shared memory
-            RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
-              RAJA::statement::For<0, RAJA::seq_exec,
-                RAJA::statement::Lambda<0>
-              >
-             >,
-
-           //Read data from shared memory
-            RAJA::statement::For<1, RAJA::seq_exec,
-           RAJA::statement::For<0, RAJA::omp_parallel_for_exec,
-                                RAJA::statement::Lambda<1>
-           >
-          >
-         > //close shared mem window
-        > //2
-       >//3
-     >//close policy
-    > //close list
-  ,RAJA::list<
-    RAJA::KernelPolicy<
-      RAJA::statement::For<3, RAJA::omp_parallel_for_exec,
-        RAJA::statement::For<2, RAJA::seq_exec,
-
-          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<0,1>,
-
-           //Load data into shared memory
-           RAJA::statement::For<1, RAJA::seq_exec,
-              RAJA::statement::For<0, RAJA::seq_exec,
-                RAJA::statement::Lambda<0>
-              >
-             >,
-
-           //Read data from shared memory
-            RAJA::statement::For<1, RAJA::seq_exec,
-              RAJA::statement::For<0, RAJA::seq_exec,
-                RAJA::statement::Lambda<1>
-           >
-          >
-         > //close shared mem window
-        > //2
-       >//3
-      > //close policy list
-     > //close list
-  ,RAJA::list<
-    RAJA::KernelPolicy<
-           RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
-                                     RAJA::ArgList<2, 3>,
-
-          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem,RAJA::ParamList<0,1>,
-
-           //Load data into shared memory
-           RAJA::statement::For<1, RAJA::seq_exec,
-              RAJA::statement::For<0, RAJA::seq_exec,
-                RAJA::statement::Lambda<0>
-              >
-             >,
-
-           //Read data from shared memory
-            RAJA::statement::For<1, RAJA::seq_exec,
-              RAJA::statement::For<0, RAJA::seq_exec,
-                RAJA::statement::Lambda<1>
-           >
-          >
-         > //close shared mem window
-       >//outer collapsed
-      > //close policy list
-     > //close list
-   >;
+using TestTypes = ::testing::Types<
+    RAJA::list<RAJA::KernelPolicy<RAJA::statement::For<
+        3,
+        RAJA::seq_exec,
+        RAJA::statement::For<
+            2,
+            RAJA::seq_exec,
+
+            RAJA::statement::InitLocalMem<
+                RAJA::cpu_tile_mem,
+                RAJA::ParamList<0, 1>,
+
+                // Load data into shared memory
+                RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
+                                          RAJA::ArgList<0, 1>,
+                                          RAJA::statement::Lambda<0>>,
+
+                // Read data from shared memory
+                RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
+                                          RAJA::ArgList<0, 1>,
+                                          RAJA::statement::Lambda<1>>>>  // for
+                                                                         // 2
+        >                            // for 3
+                                  >  // close policy
+               >                     // close list
+
+    ,
+    RAJA::list<RAJA::KernelPolicy<RAJA::statement::For<
+        3,
+        RAJA::seq_exec,
+        RAJA::statement::For<
+            2,
+            RAJA::seq_exec,
+
+            RAJA::statement::InitLocalMem<
+                RAJA::cpu_tile_mem,
+                RAJA::ParamList<0, 1>,
+
+                // Load data into shared memory
+                RAJA::statement::For<
+                    1,
+                    RAJA::omp_parallel_for_exec,
+                    RAJA::statement::
+                        For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>,
+
+                // Read data from shared memory
+                RAJA::statement::For<
+                    1,
+                    RAJA::seq_exec,
+                    RAJA::statement::For<0,
+                                         RAJA::omp_parallel_for_exec,
+                                         RAJA::statement::Lambda<
+                                             1>>>>  // close shared mem window
+            >                                       // 2
+        >                                           // 3
+                                  >                 // close policy
+               >                                    // close list
+    ,
+    RAJA::list<RAJA::KernelPolicy<RAJA::statement::For<
+        3,
+        RAJA::omp_parallel_for_exec,
+        RAJA::statement::For<
+            2,
+            RAJA::seq_exec,
+
+            RAJA::statement::InitLocalMem<
+                RAJA::cpu_tile_mem,
+                RAJA::ParamList<0, 1>,
+
+                // Load data into shared memory
+                RAJA::statement::For<
+                    1,
+                    RAJA::seq_exec,
+                    RAJA::statement::
+                        For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>,
+
+                // Read data from shared memory
+                RAJA::statement::For<
+                    1,
+                    RAJA::seq_exec,
+                    RAJA::statement::For<0,
+                                         RAJA::seq_exec,
+                                         RAJA::statement::Lambda<
+                                             1>>>>  // close shared mem window
+            >                                       // 2
+        >                                           // 3
+                                  >                 // close policy list
+               >                                    // close list
+    ,
+    RAJA::list<RAJA::KernelPolicy<RAJA::statement::Collapse<
+        RAJA::omp_parallel_collapse_exec,
+        RAJA::ArgList<2, 3>,
+
+        RAJA::statement::InitLocalMem<
+            RAJA::cpu_tile_mem,
+            RAJA::ParamList<0, 1>,
+
+            // Load data into shared memory
+            RAJA::statement::For<
+                1,
+                RAJA::seq_exec,
+                RAJA::statement::
+                    For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>,
+
+            // Read data from shared memory
+            RAJA::statement::For<1,
+                                 RAJA::seq_exec,
+                                 RAJA::statement::For<0,
+                                                      RAJA::seq_exec,
+                                                      RAJA::statement::Lambda<
+                                                          1>>>>  // close shared
+                                                                 // mem window
+        >                            // outer collapsed
+                                  >  // close policy list
+               >                     // close list
+    >;
 
 
 INSTANTIATE_TYPED_TEST_SUITE_P(OpenMP, MatTranspose, TestTypes);
@@ -599,60 +663,79 @@ INSTANTIATE_TYPED_TEST_SUITE_P(OpenMP, TypedLocalMem, TestTypes);
 
 #if defined(RAJA_ENABLE_CUDA)
 
-using CUDATypes =
-  ::testing::Types<
-  RAJA::list<
-    RAJA::KernelPolicy<
-      RAJA::statement::CudaKernel<
-        RAJA::statement::For<3, RAJA::cuda_block_y_direct,
-          RAJA::statement::For<2, RAJA::cuda_block_x_direct,
-
-            RAJA::statement::InitLocalMem<RAJA::cuda_shared_mem, RAJA::ParamList<0,1>,
-
-              //Load data into shared memory
-              RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
-                RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
-                  RAJA::statement::Lambda<0> > >,
-              RAJA::statement::CudaSyncThreads,
-
-              //Read data from shared memory
-              RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
-                RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
-                  RAJA::statement::Lambda<1> > >,
-              RAJA::statement::CudaSyncThreads
-            > //close shared memory scope
-          >//for 2
-        >//for 3
-      > //CudaKernel
-    > //kernel policy
-  > //list
-  ,
-  RAJA::list<
-    RAJA::KernelPolicy<
-      RAJA::statement::CudaKernel<
-        RAJA::statement::For<3, RAJA::cuda_block_y_loop,
-          RAJA::statement::For<2, RAJA::cuda_block_x_loop,
-
-            RAJA::statement::InitLocalMem<RAJA::cuda_shared_mem, RAJA::ParamList<0,1>,
-
-              //Load data into shared memory
-              RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
-                RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
-                  RAJA::statement::Lambda<0> > >,
-              RAJA::statement::CudaSyncThreads,
-
-              //Read data from shared memory
-              RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
-                RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
-                  RAJA::statement::Lambda<1> > >,
-              RAJA::statement::CudaSyncThreads
-            > //close shared memory scope
-          >//for 2
-        >//for 3
-      > //CudaKernel
-    > //kernel policy
-  > //list
-  >; //types
+using CUDATypes = ::testing::Types<
+    RAJA::list<
+        RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::For<
+            3,
+            RAJA::cuda_block_y_direct,
+            RAJA::statement::For<
+                2,
+                RAJA::cuda_block_x_direct,
+
+                RAJA::statement::InitLocalMem<
+                    RAJA::cuda_shared_mem,
+                    RAJA::ParamList<0, 1>,
+
+                    // Load data into shared memory
+                    RAJA::statement::For<
+                        1,
+                        RAJA::cuda_thread_y_direct,
+                        RAJA::statement::For<0,
+                                             RAJA::cuda_thread_x_direct,
+                                             RAJA::statement::Lambda<0>>>,
+                    RAJA::statement::CudaSyncThreads,
+
+                    // Read data from shared memory
+                    RAJA::statement::For<
+                        1,
+                        RAJA::cuda_thread_y_direct,
+                        RAJA::statement::For<0,
+                                             RAJA::cuda_thread_x_direct,
+                                             RAJA::statement::Lambda<1>>>,
+                    RAJA::statement::CudaSyncThreads>     // close shared memory
+                                                          // scope
+                >                                         // for 2
+            >                                             // for 3
+                                                       >  // CudaKernel
+                           >                              // kernel policy
+        >                                                 // list
+    ,
+    RAJA::list<
+        RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::For<
+            3,
+            RAJA::cuda_block_y_loop,
+            RAJA::statement::For<
+                2,
+                RAJA::cuda_block_x_loop,
+
+                RAJA::statement::InitLocalMem<
+                    RAJA::cuda_shared_mem,
+                    RAJA::ParamList<0, 1>,
+
+                    // Load data into shared memory
+                    RAJA::statement::For<
+                        1,
+                        RAJA::cuda_thread_y_direct,
+                        RAJA::statement::For<0,
+                                             RAJA::cuda_thread_x_direct,
+                                             RAJA::statement::Lambda<0>>>,
+                    RAJA::statement::CudaSyncThreads,
+
+                    // Read data from shared memory
+                    RAJA::statement::For<
+                        1,
+                        RAJA::cuda_thread_y_direct,
+                        RAJA::statement::For<0,
+                                             RAJA::cuda_thread_x_direct,
+                                             RAJA::statement::Lambda<1>>>,
+                    RAJA::statement::CudaSyncThreads>     // close shared memory
+                                                          // scope
+                >                                         // for 2
+            >                                             // for 3
+                                                       >  // CudaKernel
+                           >                              // kernel policy
+        >                                                 // list
+    >;                                                    // types
 INSTANTIATE_TYPED_TEST_SUITE_P(CUDA, MatTranspose, CUDATypes);
 INSTANTIATE_TYPED_TEST_SUITE_P(CUDA, TypedLocalMem, CUDATypes);
 
@@ -660,80 +743,90 @@ INSTANTIATE_TYPED_TEST_SUITE_P(CUDA, TypedLocalMem, CUDATypes);
 
 #if defined(RAJA_ENABLE_HIP)
 
-using HIPTypes =
-  ::testing::Types<
-  RAJA::list<
-    RAJA::KernelPolicy<
-      RAJA::statement::HipKernel<
-        RAJA::statement::For<3, RAJA::hip_block_y_direct,
-          RAJA::statement::For<2, RAJA::hip_block_x_direct,
-
-            RAJA::statement::InitLocalMem<RAJA::hip_shared_mem, RAJA::ParamList<0,1>,
-
-              //Load data into shared memory
-              RAJA::statement::For<1, RAJA::hip_thread_y_direct,
-                RAJA::statement::For<0, RAJA::hip_thread_x_direct,
-                  RAJA::statement::Lambda<0>
-                >
-              >,
-              RAJA::statement::HipSyncThreads,
-
-              //Read data from shared memory
-              RAJA::statement::For<1, RAJA::hip_thread_y_direct,
-                RAJA::statement::For<0, RAJA::hip_thread_x_direct,
-                  RAJA::statement::Lambda<1>
-                >
-              >,
-              RAJA::statement::HipSyncThreads
-            > //close shared memory scope
-          >//for 2
-        >//for 3
-      > //HipKernel
-    > //kernel policy
-  > //list
-  ,
-  RAJA::list<
-    RAJA::KernelPolicy<
-      RAJA::statement::HipKernel<
-        RAJA::statement::For<3, RAJA::hip_block_y_loop,
-          RAJA::statement::For<2, RAJA::hip_block_x_loop,
-
-            RAJA::statement::InitLocalMem<RAJA::hip_shared_mem, RAJA::ParamList<0,1>,
-
-              //Load data into shared memory
-              RAJA::statement::For<1, RAJA::hip_thread_y_direct,
-                RAJA::statement::For<0, RAJA::hip_thread_x_direct,
-                  RAJA::statement::Lambda<0>
-                >
-              >,
-              RAJA::statement::HipSyncThreads,
-
-              //Read data from shared memory
-              RAJA::statement::For<1, RAJA::hip_thread_y_direct,
-                RAJA::statement::For<0, RAJA::hip_thread_x_direct,
-                  RAJA::statement::Lambda<1>
-                >
-              >,
-              RAJA::statement::HipSyncThreads
-            > //close shared memory scope
-          >//for 2
-        >//for 3
-      > //HipKernel
-    > //kernel policy
-  > //list
-  >; //types
+using HIPTypes = ::testing::Types<
+    RAJA::list<
+        RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::For<
+            3,
+            RAJA::hip_block_y_direct,
+            RAJA::statement::For<
+                2,
+                RAJA::hip_block_x_direct,
+
+                RAJA::statement::InitLocalMem<
+                    RAJA::hip_shared_mem,
+                    RAJA::ParamList<0, 1>,
+
+                    // Load data into shared memory
+                    RAJA::statement::For<
+                        1,
+                        RAJA::hip_thread_y_direct,
+                        RAJA::statement::For<0,
+                                             RAJA::hip_thread_x_direct,
+                                             RAJA::statement::Lambda<0>>>,
+                    RAJA::statement::HipSyncThreads,
+
+                    // Read data from shared memory
+                    RAJA::statement::For<
+                        1,
+                        RAJA::hip_thread_y_direct,
+                        RAJA::statement::For<0,
+                                             RAJA::hip_thread_x_direct,
+                                             RAJA::statement::Lambda<1>>>,
+                    RAJA::statement::HipSyncThreads>     // close shared memory
+                                                         // scope
+                >                                        // for 2
+            >                                            // for 3
+                                                      >  // HipKernel
+                           >                             // kernel policy
+        >                                                // list
+    ,
+    RAJA::list<
+        RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::For<
+            3,
+            RAJA::hip_block_y_loop,
+            RAJA::statement::For<
+                2,
+                RAJA::hip_block_x_loop,
+
+                RAJA::statement::InitLocalMem<
+                    RAJA::hip_shared_mem,
+                    RAJA::ParamList<0, 1>,
+
+                    // Load data into shared memory
+                    RAJA::statement::For<
+                        1,
+                        RAJA::hip_thread_y_direct,
+                        RAJA::statement::For<0,
+                                             RAJA::hip_thread_x_direct,
+                                             RAJA::statement::Lambda<0>>>,
+                    RAJA::statement::HipSyncThreads,
+
+                    // Read data from shared memory
+                    RAJA::statement::For<
+                        1,
+                        RAJA::hip_thread_y_direct,
+                        RAJA::statement::For<0,
+                                             RAJA::hip_thread_x_direct,
+                                             RAJA::statement::Lambda<1>>>,
+                    RAJA::statement::HipSyncThreads>     // close shared memory
+                                                         // scope
+                >                                        // for 2
+            >                                            // for 3
+                                                      >  // HipKernel
+                           >                             // kernel policy
+        >                                                // list
+    >;                                                   // types
 INSTANTIATE_TYPED_TEST_SUITE_P(HIP, MatTranspose_gpu, HIPTypes);
 INSTANTIATE_TYPED_TEST_SUITE_P(HIP, TypedLocalMem_gpu, HIPTypes);
 
 #endif
 
 
-
 template <typename NestedPolicy>
 class MatMultiply : public ::testing::Test
 {
-  virtual void SetUp(){}
-  virtual void TearDown(){}
+  virtual void SetUp() {}
+  virtual void TearDown() {}
 };
 
 TYPED_TEST_SUITE_P(MatMultiply);
@@ -747,22 +840,22 @@ GPU_TYPED_TEST_P(MatMultiply, shmem)
   static constexpr size_t M = TypeParam::M;
   static constexpr size_t P = TypeParam::P;
 
-  //Matrix A size: N x M
-  //Matrix B size: M x P
-  //Result C size: N x P
+  // Matrix A size: N x M
+  // Matrix B size: M x P
+  // Result C size: N x P
 
   // Note: on CPU A==d_A, etc.
   double *A, *d_A;
-  TypeParam::alloc_double(N*M, &A, &d_A);
+  TypeParam::alloc_double(N * M, &A, &d_A);
 
   double *B, *d_B;
-  TypeParam::alloc_double(M*P, &B, &d_B);
+  TypeParam::alloc_double(M * P, &B, &d_B);
 
   double *C, *d_C;
-  TypeParam::alloc_double(N*P, &C, &d_C);
+  TypeParam::alloc_double(N * P, &C, &d_C);
 
 
-  double *C_sol = new double[N*P];
+  double* C_sol = new double[N * P];
 
   RAJA::View<double, RAJA::Layout<2>> C_solView(C_sol, N, P);
 
@@ -771,34 +864,41 @@ GPU_TYPED_TEST_P(MatMultiply, shmem)
     RAJA::View<double, RAJA::Layout<2>> Aview(A, N, M);
     RAJA::View<double, RAJA::Layout<2>> Bview(B, M, P);
     RAJA::View<double, RAJA::Layout<2>> Cview(C, N, P);
-    for (size_t row = 0; row < N; ++row) {
-      for (size_t col = 0; col < M; ++col) {
-        Aview(row, col) = ((double)col-row)/(N*M)+1;
+    for (size_t row = 0; row < N; ++row)
+    {
+      for (size_t col = 0; col < M; ++col)
+      {
+        Aview(row, col) = ((double)col - row) / (N * M) + 1;
       }
     }
 
-    for (size_t row = 0; row < M; ++row) {
-      for (size_t col = 0; col < P; ++col) {
-        Bview(row, col) = ((double)col+row)/(M*P)+1;
+    for (size_t row = 0; row < M; ++row)
+    {
+      for (size_t col = 0; col < P; ++col)
+      {
+        Bview(row, col) = ((double)col + row) / (M * P) + 1;
       }
     }
 
-    for(size_t r=0; r<N; ++r){
-      for(size_t c=0; c<P; ++c){
+    for (size_t r = 0; r < N; ++r)
+    {
+      for (size_t c = 0; c < P; ++c)
+      {
         double dot = 0.0;
-        for(size_t k=0; k<M; ++k){
-          dot += Aview(r,k)*Bview(k,c);
+        for (size_t k = 0; k < M; ++k)
+        {
+          dot += Aview(r, k) * Bview(k, c);
         }
-        C_solView(r,c) = dot;
-        Cview(r,c) = 0;
+        C_solView(r, c) = dot;
+        Cview(r, c)     = 0;
       }
     }
   }
 
   // Copy A, B and C to the device (NOP on CPU)
-  TypeParam::copy_d2h(N*M, d_A, A);
-  TypeParam::copy_d2h(M*P, d_B, B);
-  TypeParam::copy_d2h(N*P, d_C, C);
+  TypeParam::copy_d2h(N * M, d_A, A);
+  TypeParam::copy_d2h(M * P, d_B, B);
+  TypeParam::copy_d2h(N * P, d_C, C);
 
   // Create device views of data
   RAJA::View<double, RAJA::Layout<2>> Aview(d_A, N, M);
@@ -808,160 +908,172 @@ GPU_TYPED_TEST_P(MatMultiply, shmem)
   using Shmem      = typename TypeParam::Shmem;
   using ThreadPriv = typename TypeParam::ThreadPriv;
 
-  Shmem aShared, bShared; //memory to be shared between threads
-  ThreadPriv pVal; //iteration dependent data
-
-  RAJA::kernel_param<Pol>(RAJA::make_tuple(RAJA::RangeSegment(0, N),
-                                           RAJA::RangeSegment(0, M),
-                                           RAJA::RangeSegment(0, P)),
-                          RAJA::make_tuple(aShared, bShared, pVal),
-
-  // Zero out thread local memory for storing dot products
-  [=] RAJA_HOST_DEVICE (int tn, int tp, ThreadPriv &pVal) {
-
-    pVal(tn,tp) = 0.0;
-
-  },
-
-  // Load tile of A
-  [=] RAJA_HOST_DEVICE (int n, int m, int tn, int tm, Shmem &aShared) {
-
-     aShared(tn, tm) = Aview(n, m);
-
-  },
+  Shmem aShared, bShared;  // memory to be shared between threads
+  ThreadPriv pVal;         // iteration dependent data
 
-  // Load tile of B
-  [=] RAJA_HOST_DEVICE (int m, int p, int tm, int tp, Shmem &bShared) {
+  RAJA::kernel_param<Pol>(
+      RAJA::make_tuple(RAJA::RangeSegment(0, N), RAJA::RangeSegment(0, M),
+                       RAJA::RangeSegment(0, P)),
+      RAJA::make_tuple(aShared, bShared, pVal),
 
-    bShared(tm, tp) = Bview(m, p);
+      // Zero out thread local memory for storing dot products
+      [=] RAJA_HOST_DEVICE(int tn, int tp, ThreadPriv& pVal)
+      { pVal(tn, tp) = 0.0; },
 
-  },
+      // Load tile of A
+      [=] RAJA_HOST_DEVICE(int n, int m, int tn, int tm, Shmem& aShared)
+      { aShared(tn, tm) = Aview(n, m); },
 
-  // Do partial update in shmem
-  [=] RAJA_HOST_DEVICE (int tn, int tm, int tp, Shmem &aShared,  Shmem &bShared, ThreadPriv & pVal) {
+      // Load tile of B
+      [=] RAJA_HOST_DEVICE(int m, int p, int tm, int tp, Shmem& bShared)
+      { bShared(tm, tp) = Bview(m, p); },
 
-    pVal(tn,tp) += aShared(tn,tm) * bShared(tm, tp);
+      // Do partial update in shmem
+      [=] RAJA_HOST_DEVICE(int tn, int tm, int tp, Shmem& aShared,
+                           Shmem& bShared, ThreadPriv& pVal)
+      { pVal(tn, tp) += aShared(tn, tm) * bShared(tm, tp); },
 
-  },
-
-  // Write out complete result
-  [=] RAJA_HOST_DEVICE (int n, int p, int tn, int tp,  ThreadPriv &pVal) {
-
-    Cview(n,p) = pVal(tn,tp);
-
-  });
+      // Write out complete result
+      [=] RAJA_HOST_DEVICE(int n, int p, int tn, int tp, ThreadPriv& pVal)
+      { Cview(n, p) = pVal(tn, tp); });
 
   // copy result back to host (NOP on CPU)
-  TypeParam::copy_d2h(N*P, C, d_C);
+  TypeParam::copy_d2h(N * P, C, d_C);
 
   // Check result
   RAJA::View<double, RAJA::Layout<2>> Cresult(C, N, P);
-  for (size_t row = 0; row < N; ++row) {
-    for (size_t col = 0; col < P; ++col) {
-      ASSERT_FLOAT_EQ((double)Cresult(row,col), (double)C_solView(row,col));
+  for (size_t row = 0; row < N; ++row)
+  {
+    for (size_t col = 0; col < P; ++col)
+    {
+      ASSERT_FLOAT_EQ((double)Cresult(row, col), (double)C_solView(row, col));
     }
   }
 
   TypeParam::free_double(A, d_A);
   TypeParam::free_double(B, d_B);
   TypeParam::free_double(C, d_C);
-  delete [] C_sol;
+  delete[] C_sol;
 }
 
 REGISTER_TYPED_TEST_SUITE_P(MatMultiply, shmem);
 
-void alloc_cpu(size_t N, double **host, double **device){
-  *host = new double[N];
+void alloc_cpu(size_t N, double** host, double** device)
+{
+  *host   = new double[N];
   *device = *host;
 }
 
-void copy_h2d_cpu(size_t , double *, double *){
+void copy_h2d_cpu(size_t, double*, double*)
+{
   // NOP
 }
 
-void copy_d2h_cpu(size_t , double *, double *){
+void copy_d2h_cpu(size_t, double*, double*)
+{
   // NOP
 }
 
-void free_cpu(double *host, double *){
-  delete[] host;
-}
+void free_cpu(double* host, double*) { delete[] host; }
+
+struct Policy_MatMultiply_cpu
+{
 
-struct Policy_MatMultiply_cpu {
-
-    static constexpr size_t N = 150;
-    static constexpr size_t M = 25;
-    static constexpr size_t P = 95;
-    static constexpr size_t tile_size = 16;
-
-    constexpr static void(*alloc_double)(size_t, double**, double**) = alloc_cpu;
-    constexpr static void(*copy_h2d)(size_t, double*, double*) = copy_h2d_cpu;
-    constexpr static void(*copy_d2h)(size_t, double*, double*) = copy_d2h_cpu;
-    constexpr static void(*free_double)(double*, double*) = free_cpu;
-
-    using Shmem      = RAJA::LocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<tile_size, tile_size>>;
-    using ThreadPriv = RAJA::LocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<tile_size, tile_size>>;
-
-    using shmem_Lambda0 = RAJA::statement::Lambda<0, RAJA::Offsets<0, 2>, RAJA::Params<2>>;
-    using shmem_Lambda1 = RAJA::statement::Lambda<1, RAJA::Segs<0, 1>, RAJA::Offsets<0, 1>, RAJA::Params<0>>;
-    using shmem_Lambda2 = RAJA::statement::Lambda<2, RAJA::Segs<1, 2>, RAJA::Offsets<1, 2>, RAJA::Params<1>>;
-    using shmem_Lambda3 = RAJA::statement::Lambda<3, RAJA::Offsets<0, 1, 2>, RAJA::Params<0, 1, 2>>;
-    using shmem_Lambda4 = RAJA::statement::Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, RAJA::Params<2>>;
-
-    // Segments:
-    // 0: N
-    // 1: M
-    // 2: P
-
-    using exec_policy =
-        RAJA::KernelPolicy<
-          //Initalize thread private value
-          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<2,1,0>,
-
-            // Tile of N and P (the result matrix C)
-            RAJA::statement::Tile<0, RAJA::tile_fixed<tile_size>, RAJA::seq_exec,
-              RAJA::statement::Tile<2, RAJA::tile_fixed<tile_size>, RAJA::seq_exec,
-
-               // zero out shmem tile of C
-               RAJA::statement::For<2, RAJA::seq_exec,
-                  RAJA::statement::For<0, RAJA::seq_exec,
-                  shmem_Lambda0 > >,
-
-                // Slide window across matrix: Tile in M
-                RAJA::statement::Tile<1, RAJA::tile_fixed<tile_size>, RAJA::seq_exec,
-
-                   // Load tile of A into shmem
-                   RAJA::statement::For<1, RAJA::seq_exec,
-                     RAJA::statement::For<0, RAJA::seq_exec,
-                     shmem_Lambda1
-                    >
-                   >,
-
-                   // Load tile of B into shmem
-                   RAJA::statement::For<2, RAJA::seq_exec,
-                     RAJA::statement::For<1, RAJA::seq_exec,
-                     shmem_Lambda2
-                    >
-                   >,
-
-                   //Partial multiplication
-                   RAJA::statement::For<2, RAJA::seq_exec,
-                     RAJA::statement::For<1, RAJA::seq_exec,
-                       RAJA::statement::For<0, RAJA::seq_exec,
-                       shmem_Lambda3
-                       >
-                     >
-                   >
-                >, //sliding window
-
-                //Write memory out to global matrix
-                RAJA::statement::For<2, RAJA::seq_exec,
-                  RAJA::statement::For<0, RAJA::seq_exec,
-                  shmem_Lambda4 > >
-             >
-            >
-           > //Create shared memory
-          >;
+  static constexpr size_t N         = 150;
+  static constexpr size_t M         = 25;
+  static constexpr size_t P         = 95;
+  static constexpr size_t tile_size = 16;
+
+  constexpr static void (*alloc_double)(size_t, double**, double**) = alloc_cpu;
+  constexpr static void (*copy_h2d)(size_t, double*, double*) = copy_h2d_cpu;
+  constexpr static void (*copy_d2h)(size_t, double*, double*) = copy_d2h_cpu;
+  constexpr static void (*free_double)(double*, double*)      = free_cpu;
+
+  using Shmem = RAJA::
+      LocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<tile_size, tile_size>>;
+  using ThreadPriv = RAJA::
+      LocalArray<double, RAJA::PERM_IJ, RAJA::SizeList<tile_size, tile_size>>;
+
+  using shmem_Lambda0 =
+      RAJA::statement::Lambda<0, RAJA::Offsets<0, 2>, RAJA::Params<2>>;
+  using shmem_Lambda1 = RAJA::statement::
+      Lambda<1, RAJA::Segs<0, 1>, RAJA::Offsets<0, 1>, RAJA::Params<0>>;
+  using shmem_Lambda2 = RAJA::statement::
+      Lambda<2, RAJA::Segs<1, 2>, RAJA::Offsets<1, 2>, RAJA::Params<1>>;
+  using shmem_Lambda3 =
+      RAJA::statement::Lambda<3, RAJA::Offsets<0, 1, 2>, RAJA::Params<0, 1, 2>>;
+  using shmem_Lambda4 = RAJA::statement::
+      Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, RAJA::Params<2>>;
+
+  // Segments:
+  // 0: N
+  // 1: M
+  // 2: P
+
+  using exec_policy = RAJA::KernelPolicy<
+      // Initialize thread private value
+      RAJA::statement::InitLocalMem<
+          RAJA::cpu_tile_mem,
+          RAJA::ParamList<2, 1, 0>,
+
+          // Tile of N and P (the result matrix C)
+          RAJA::statement::Tile<
+              0,
+              RAJA::tile_fixed<tile_size>,
+              RAJA::seq_exec,
+              RAJA::statement::Tile<
+                  2,
+                  RAJA::tile_fixed<tile_size>,
+                  RAJA::seq_exec,
+
+                  // zero out shmem tile of C
+                  RAJA::statement::For<
+                      2,
+                      RAJA::seq_exec,
+                      RAJA::statement::For<0, RAJA::seq_exec, shmem_Lambda0>>,
+
+                  // Slide window across matrix: Tile in M
+                  RAJA::statement::Tile<
+                      1,
+                      RAJA::tile_fixed<tile_size>,
+                      RAJA::seq_exec,
+
+                      // Load tile of A into shmem
+                      RAJA::statement::For<1,
+                                           RAJA::seq_exec,
+                                           RAJA::statement::For<0,
+                                                                RAJA::seq_exec,
+                                                                shmem_Lambda1>>,
+
+                      // Load tile of B into shmem
+                      RAJA::statement::For<2,
+                                           RAJA::seq_exec,
+                                           RAJA::statement::For<1,
+                                                                RAJA::seq_exec,
+                                                                shmem_Lambda2>>,
+
+                      // Partial multiplication
+                      RAJA::statement::For<
+                          2,
+                          RAJA::seq_exec,
+                          RAJA::statement::For<
+                              1,
+                              RAJA::seq_exec,
+                              RAJA::statement::For<
+                                  0,
+                                  RAJA::seq_exec,
+                                  shmem_Lambda3>>>>,  // sliding
+                                                      // window
+
+                  // Write memory out to global matrix
+                  RAJA::statement::For<
+                      2,
+                      RAJA::seq_exec,
+                      RAJA::statement::For<0,
+                                           RAJA::seq_exec,
+                                           shmem_Lambda4>>>>>  // Create shared
+                                                               // memory
+      >;
 };
 
 using MatMultiplyTypes = ::testing::Types<Policy_MatMultiply_cpu>;
diff --git a/test/old-tests/unit/test-simd.cpp b/test/old-tests/unit/test-simd.cpp
index 72bd513fd8..bb4f7ab274 100644
--- a/test/old-tests/unit/test-simd.cpp
+++ b/test/old-tests/unit/test-simd.cpp
@@ -21,26 +21,28 @@ using namespace RAJA::statement;
 TEST(SIMD, Align)
 {
 
-  int N = 1024;
+  int N    = 1024;
   double c = 0.5;
-  double *a =
+  double* a =
       RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN, N * sizeof(double));
-  double *b =
+  double* b =
       RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN, N * sizeof(double));
 
-  for (int i = 0; i < N; ++i) {
+  for (int i = 0; i < N; ++i)
+  {
     a[i] = 0;
     b[i] = 2.0;
   }
 
 
-  double *y = RAJA::align_hint(a);
-  double *x = RAJA::align_hint(b);
+  double* y = RAJA::align_hint(a);
+  double* x = RAJA::align_hint(b);
 
   RAJA::forall<RAJA::simd_exec>(RAJA::RangeSegment(0, N),
                                 [=](int i) { y[i] += x[i] * c; });
 
-  for (int i = 0; i < N; ++i) {
+  for (int i = 0; i < N; ++i)
+  {
     ASSERT_DOUBLE_EQ((double)y[i], (double)1.0);
   }
 
@@ -53,33 +55,33 @@ TEST(SIMD, OMPAndSimd)
 {
 
   using POL = RAJA::KernelPolicy<RAJA::statement::For<
-      1,
-      RAJA::omp_parallel_for_exec,
-      RAJA::statement::For<0, RAJA::simd_exec, RAJA::statement::Lambda<0> > > >;
+      1, RAJA::omp_parallel_for_exec,
+      RAJA::statement::For<0, RAJA::simd_exec, RAJA::statement::Lambda<0>>>>;
 
   const RAJA::Index_type N = 32;
   const RAJA::Index_type M = 32;
 
-  double *a = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* a = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                   N * M * sizeof(double));
-  double *b = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* b = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                   N * M * sizeof(double));
-  double *c = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* c = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                   N * M * sizeof(double));
 
-  for (int i = 0; i < N * M; ++i) {
+  for (int i = 0; i < N * M; ++i)
+  {
     a[i] = 1;
     b[i] = 1;
     c[i] = 0.0;
   }
 
-  RAJA::kernel<POL>(RAJA::make_tuple(RAJA::RangeSegment(0, N),
-                                     RAJA::RangeSegment(0, M)),
-                    [=](RAJA::Index_type i, RAJA::Index_type j) {
-                      c[i + j * N] = a[i + j * N] + b[i + j * N];
-                    });
+  RAJA::kernel<POL>(
+      RAJA::make_tuple(RAJA::RangeSegment(0, N), RAJA::RangeSegment(0, M)),
+      [=](RAJA::Index_type i, RAJA::Index_type j)
+      { c[i + j * N] = a[i + j * N] + b[i + j * N]; });
 
-  for (int i = 0; i < N * M; ++i) {
+  for (int i = 0; i < N * M; ++i)
+  {
     ASSERT_DOUBLE_EQ((double)c[i], (double)2.0);
   }
 
@@ -92,49 +94,46 @@ TEST(SIMD, OMPAndSimd_MultiLambda)
 {
 
   using POL = RAJA::KernelPolicy<RAJA::statement::For<
-      1,
-      RAJA::omp_parallel_for_exec,
-      RAJA::statement::For<0,
-                           RAJA::simd_exec,
-                           RAJA::statement::Lambda<0>,
-                           RAJA::statement::Lambda<1> > > >;
+      1, RAJA::omp_parallel_for_exec,
+      RAJA::statement::For<0, RAJA::simd_exec, RAJA::statement::Lambda<0>,
+                           RAJA::statement::Lambda<1>>>>;
 
   const RAJA::Index_type N = 32;
   const RAJA::Index_type M = 32;
 
-  double *a = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* a = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                   N * M * sizeof(double));
-  double *b = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* b = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                   N * M * sizeof(double));
-  double *c = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* c = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                   N * M * sizeof(double));
 
-  double *a2 = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* a2 = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                    N * M * sizeof(double));
-  double *b2 = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* b2 = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                    N * M * sizeof(double));
-  double *c2 = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
+  double* c2 = RAJA::allocate_aligned_type<double>(RAJA::DATA_ALIGN,
                                                    N * M * sizeof(double));
 
-  for (int i = 0; i < N * M; ++i) {
-    a[i] = 1;
-    b[i] = 1;
-    c[i] = 0.0;
+  for (int i = 0; i < N * M; ++i)
+  {
+    a[i]  = 1;
+    b[i]  = 1;
+    c[i]  = 0.0;
     a2[i] = 1;
     b2[i] = 1;
     c2[i] = 0.0;
   }
 
-  RAJA::kernel<POL>(RAJA::make_tuple(RAJA::RangeSegment(0, N),
-                                     RAJA::RangeSegment(0, M)),
-                    [=](RAJA::Index_type i, RAJA::Index_type j) {
-                      c[i + j * N] = a[i + j * N] + b[i + j * N];
-                    },
-                    [=](RAJA::Index_type i, RAJA::Index_type j) {
-                      c2[i + j * N] = a2[i + j * N] + b2[i + j * N];
-                    });
+  RAJA::kernel<POL>(
+      RAJA::make_tuple(RAJA::RangeSegment(0, N), RAJA::RangeSegment(0, M)),
+      [=](RAJA::Index_type i, RAJA::Index_type j)
+      { c[i + j * N] = a[i + j * N] + b[i + j * N]; },
+      [=](RAJA::Index_type i, RAJA::Index_type j)
+      { c2[i + j * N] = a2[i + j * N] + b2[i + j * N]; });
 
-  for (int i = 0; i < N * M; ++i) {
+  for (int i = 0; i < N * M; ++i)
+  {
     ASSERT_DOUBLE_EQ((double)c[i], (double)2.0);
     ASSERT_DOUBLE_EQ((double)c2[i], (double)2.0);
   }
diff --git a/test/unit/algorithm/test-algorithm-util-for_each.cpp b/test/unit/algorithm/test-algorithm-util-for_each.cpp
index db918ad234..5bd0d3a612 100644
--- a/test/unit/algorithm/test-algorithm-util-for_each.cpp
+++ b/test/unit/algorithm/test-algorithm-util-for_each.cpp
@@ -19,8 +19,9 @@
 #include <vector>
 #include <set>
 
-template<typename T>
-class ForEachUnitTest : public ::testing::Test {};
+template <typename T>
+class ForEachUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(ForEachUnitTest, UnitIndexTypes);
 
@@ -30,10 +31,12 @@ TYPED_TEST(ForEachUnitTest, EmptyRange)
   std::vector<TypeParam> numbers;
 
   std::vector<TypeParam> copies;
-  RAJA::for_each(numbers, [&](TypeParam& number) {
-    number += 1;
-    copies.push_back(number);
-  });
+  RAJA::for_each(numbers,
+                 [&](TypeParam& number)
+                 {
+                   number += 1;
+                   copies.push_back(number);
+                 });
 
   ASSERT_EQ(copies.size(), 0);
   ASSERT_EQ(numbers.size(), 0);
@@ -42,55 +45,64 @@ TYPED_TEST(ForEachUnitTest, EmptyRange)
 TYPED_TEST(ForEachUnitTest, VectorRange)
 {
   std::vector<TypeParam> numbers;
-  for (TypeParam i = 0; i < 13; ++i) {
+  for (TypeParam i = 0; i < 13; ++i)
+  {
     numbers.push_back(i);
   }
 
   std::vector<TypeParam> copies;
-  RAJA::for_each(numbers, [&](TypeParam& number) {
-    copies.push_back(number);
-    number += 1;
-  });
+  RAJA::for_each(numbers,
+                 [&](TypeParam& number)
+                 {
+                   copies.push_back(number);
+                   number += 1;
+                 });
 
   ASSERT_EQ(copies.size(), 13);
-  for (TypeParam i = 0; i < 13; ++i) {
-    ASSERT_EQ(numbers[i], copies[i]+1);
+  for (TypeParam i = 0; i < 13; ++i)
+  {
+    ASSERT_EQ(numbers[i], copies[i] + 1);
   }
 }
 
 TYPED_TEST(ForEachUnitTest, RajaSpanRange)
 {
   std::vector<TypeParam> numbers;
-  for (TypeParam i = 0; i < 11; ++i) {
+  for (TypeParam i = 0; i < 11; ++i)
+  {
     numbers.push_back(i);
   }
 
   std::vector<TypeParam> copies;
-  RAJA::for_each(RAJA::make_span(numbers.data(), 11), [&](TypeParam& number) {
-    copies.push_back(number);
-    number += 1;
-  });
+  RAJA::for_each(RAJA::make_span(numbers.data(), 11),
+                 [&](TypeParam& number)
+                 {
+                   copies.push_back(number);
+                   number += 1;
+                 });
 
   ASSERT_EQ(copies.size(), 11);
-  for (TypeParam i = 0; i < 11; ++i) {
-    ASSERT_EQ(numbers[i], copies[i]+1);
+  for (TypeParam i = 0; i < 11; ++i)
+  {
+    ASSERT_EQ(numbers[i], copies[i] + 1);
   }
 }
 
 TYPED_TEST(ForEachUnitTest, SetRange)
 {
   std::set<TypeParam> numbers;
-  for (TypeParam i = 0; i < 6; ++i) {
+  for (TypeParam i = 0; i < 6; ++i)
+  {
     numbers.insert(i);
   }
 
   std::vector<TypeParam> copies;
-  RAJA::for_each(numbers, [&](TypeParam const& number) {
-    copies.push_back(number);
-  });
+  RAJA::for_each(numbers,
+                 [&](TypeParam const& number) { copies.push_back(number); });
 
   ASSERT_EQ(copies.size(), 6);
-  for (TypeParam i = 0; i < 6; ++i) {
+  for (TypeParam i = 0; i < 6; ++i)
+  {
     ASSERT_EQ(i, copies[i]);
     ASSERT_EQ(numbers.count(i), 1);
   }
@@ -102,22 +114,21 @@ TYPED_TEST(ForEachUnitTest, EmptyTypeList)
   using numbers = camp::list<>;
 
   std::vector<TypeParam> copies;
-  RAJA::for_each_type(numbers{}, [&](auto number) {
-    copies.push_back(number);
-  });
+  RAJA::for_each_type(numbers {},
+                      [&](auto number) { copies.push_back(number); });
 
   ASSERT_EQ(copies.size(), 0);
 }
 
 
-template < typename T, T val >
+template <typename T, T val>
 T get_num(std::integral_constant<T, val>)
 {
   return val;
 }
 
-template < typename TypeParam,
-           std::enable_if_t<std::is_integral<TypeParam>::value>* = nullptr >
+template <typename TypeParam,
+          std::enable_if_t<std::is_integral<TypeParam>::value>* = nullptr>
 void run_int_type_test()
 {
   using numbers = camp::list<std::integral_constant<TypeParam, 0>,
@@ -127,24 +138,21 @@ void run_int_type_test()
                              std::integral_constant<TypeParam, 4>>;
 
   std::vector<TypeParam> copies;
-  RAJA::for_each_type(numbers{}, [&](auto number) {
-    copies.push_back(get_num(number));
-  });
+  RAJA::for_each_type(numbers {},
+                      [&](auto number) { copies.push_back(get_num(number)); });
 
   ASSERT_EQ(copies.size(), 5);
-  for (TypeParam i = 0; i < 5; ++i) {
+  for (TypeParam i = 0; i < 5; ++i)
+  {
     ASSERT_EQ(i, copies[i]);
   }
 }
 ///
-template < typename TypeParam,
-           std::enable_if_t<!std::is_integral<TypeParam>::value>* = nullptr >
+template <typename TypeParam,
+          std::enable_if_t<!std::is_integral<TypeParam>::value>* = nullptr>
 void run_int_type_test()
 {
   // ignore non-ints
 }
 
-TYPED_TEST(ForEachUnitTest, IntTypeList)
-{
-  run_int_type_test<TypeParam>();
-}
+TYPED_TEST(ForEachUnitTest, IntTypeList) { run_int_type_test<TypeParam>(); }
diff --git a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp
index 4e3f9fb795..bacec1a905 100644
--- a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp
+++ b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp
@@ -37,18 +37,24 @@
 
 
 // tag classes to differentiate reduce by attributes and apply correct testing
-struct left_fold_reduce_tag { };
-struct unordered_reduce_tag { };
+struct left_fold_reduce_tag
+{};
+struct unordered_reduce_tag
+{};
 
-struct reduce_interface_tag { };
+struct reduce_interface_tag
+{};
 
-struct reduce_default_interface_tag { };
-struct reduce_init_interface_tag { };
-struct reduce_init_op_interface_tag { };
+struct reduce_default_interface_tag
+{};
+struct reduce_init_interface_tag
+{};
+struct reduce_init_op_interface_tag
+{};
 
 
 // synchronize based on a RAJA execution policy
-template < typename policy >
+template <typename policy>
 struct PolicySynchronize
 {
   void synchronize()
@@ -59,71 +65,76 @@ struct PolicySynchronize
 
 #if defined(RAJA_ENABLE_CUDA)
 // partial specialization for cuda_exec
-template < size_t BLOCK_SIZE, bool Async >
+template <size_t BLOCK_SIZE, bool Async>
 struct PolicySynchronize<RAJA::cuda_exec<BLOCK_SIZE, Async>>
 {
   void synchronize()
   {
-    if (Async) { RAJA::synchronize<RAJA::cuda_synchronize>(); }
+    if (Async)
+    {
+      RAJA::synchronize<RAJA::cuda_synchronize>();
+    }
   }
 };
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 // partial specialization for hip_exec
-template < size_t BLOCK_SIZE, bool Async >
+template <size_t BLOCK_SIZE, bool Async>
 struct PolicySynchronize<RAJA::hip_exec<BLOCK_SIZE, Async>>
 {
   void synchronize()
   {
-    if (Async) { RAJA::synchronize<RAJA::hip_synchronize>(); }
+    if (Async)
+    {
+      RAJA::synchronize<RAJA::hip_synchronize>();
+    }
   }
 };
 #endif
 
 
-template <typename Res,
-          typename interface_tag,
-          typename ValType>
+template <typename Res, typename interface_tag, typename ValType>
 struct ReduceData;
 
 template <typename Res, typename ValType>
 struct ReduceData<Res, reduce_interface_tag, ValType>
 {
-  ValType* values = nullptr;
+  ValType* values        = nullptr;
   ValType* reduced_value = nullptr;
   Res m_res;
 
-  template < typename RandomGenerator >
-  ReduceData(size_t N, Res res, RandomGenerator gen_random)
-    : m_res(res)
+  template <typename RandomGenerator>
+  ReduceData(size_t N, Res res, RandomGenerator gen_random) : m_res(res)
   {
-    if (N > 0) {
-      values = m_res.template allocate<ValType>(N, camp::resources::MemoryAccess::Managed);
+    if (N > 0)
+    {
+      values = m_res.template allocate<ValType>(
+          N, camp::resources::MemoryAccess::Managed);
     }
-    reduced_value = m_res.template allocate<ValType>(1, camp::resources::MemoryAccess::Managed);
+    reduced_value = m_res.template allocate<ValType>(
+        1, camp::resources::MemoryAccess::Managed);
 
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; i++)
+    {
       values[i] = gen_random();
     }
   }
 
   void copy_data(size_t N)
   {
-    if ( N == 0 ) return;
+    if (N == 0) return;
   }
 
-  Res resource()
-  {
-    return m_res;
-  }
+  Res resource() { return m_res; }
 
-  ReduceData(ReduceData const&) = delete;
+  ReduceData(ReduceData const&)            = delete;
   ReduceData& operator=(ReduceData const&) = delete;
 
   ~ReduceData()
   {
-    if (values != nullptr) {
+    if (values != nullptr)
+    {
       m_res.deallocate(values, camp::resources::MemoryAccess::Managed);
       m_res.deallocate(reduced_value, camp::resources::MemoryAccess::Managed);
     }
@@ -131,15 +142,14 @@ struct ReduceData<Res, reduce_interface_tag, ValType>
 };
 
 
-template <typename Res,
-          typename T,
-          typename BinaryOp,
-          typename Reducer>
-void doReduce(ReduceData<Res, reduce_interface_tag, T> & data,
-            RAJA::Index_type N,
-            T,
-            BinaryOp,
-            Reducer reducer, reduce_interface_tag, reduce_default_interface_tag)
+template <typename Res, typename T, typename BinaryOp, typename Reducer>
+void doReduce(ReduceData<Res, reduce_interface_tag, T>& data,
+              RAJA::Index_type N,
+              T,
+              BinaryOp,
+              Reducer reducer,
+              reduce_interface_tag,
+              reduce_default_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
@@ -147,15 +157,14 @@ void doReduce(ReduceData<Res, reduce_interface_tag, T> & data,
   reducer.synchronize();
 }
 
-template <typename Res,
-          typename T,
-          typename BinaryOp,
-          typename Reducer>
-void doReduce(ReduceData<Res, reduce_interface_tag, T> & data,
-            RAJA::Index_type N,
-            T init,
-            BinaryOp,
-            Reducer reducer, reduce_interface_tag, reduce_init_interface_tag)
+template <typename Res, typename T, typename BinaryOp, typename Reducer>
+void doReduce(ReduceData<Res, reduce_interface_tag, T>& data,
+              RAJA::Index_type N,
+              T init,
+              BinaryOp,
+              Reducer reducer,
+              reduce_interface_tag,
+              reduce_init_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
@@ -163,15 +172,14 @@ void doReduce(ReduceData<Res, reduce_interface_tag, T> & data,
   reducer.synchronize();
 }
 
-template <typename Res,
-          typename T,
-          typename BinaryOp,
-          typename Reducer>
-void doReduce(ReduceData<Res, reduce_interface_tag, T> & data,
-            RAJA::Index_type N,
-            T init,
-            BinaryOp op,
-            Reducer reducer, reduce_interface_tag, reduce_init_op_interface_tag)
+template <typename Res, typename T, typename BinaryOp, typename Reducer>
+void doReduce(ReduceData<Res, reduce_interface_tag, T>& data,
+              RAJA::Index_type N,
+              T init,
+              BinaryOp op,
+              Reducer reducer,
+              reduce_interface_tag,
+              reduce_init_op_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
@@ -185,28 +193,33 @@ template <typename Res,
           typename BinaryOp,
           typename TestReducer,
           typename BinaryOpInterface>
-::testing::AssertionResult testReduce(
-    const char* test_name,
-    const unsigned seed,
-    ReduceData<Res, reduce_interface_tag, T> & data,
-    RAJA::Index_type N,
-    T init,
-    BinaryOp op,
-    TestReducer test_reducer, left_fold_reduce_tag, reduce_interface_tag si, BinaryOpInterface ci)
+::testing::AssertionResult
+testReduce(const char* test_name,
+           const unsigned seed,
+           ReduceData<Res, reduce_interface_tag, T>& data,
+           RAJA::Index_type N,
+           T init,
+           BinaryOp op,
+           TestReducer test_reducer,
+           left_fold_reduce_tag,
+           reduce_interface_tag si,
+           BinaryOpInterface ci)
 {
   doReduce(data, N, init, op, test_reducer, si, ci);
 
   T reduced_check_value = init;
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     reduced_check_value = op(std::move(reduced_check_value), data.values[i]);
   }
 
-  if (reduced_check_value != *data.reduced_value) {
+  if (reduced_check_value != *data.reduced_value)
+  {
     return ::testing::AssertionFailure()
            << test_reducer.name() << " (left fold reduce) " << test_name
            << " (with N " << N << " with seed " << seed << ")"
-           << " incorrect " << *data.reduced_value
-           << ", expected " << reduced_check_value;
+           << " incorrect " << *data.reduced_value << ", expected "
+           << reduced_check_value;
   }
 
   return ::testing::AssertionSuccess();
@@ -217,84 +230,98 @@ template <typename Res,
           typename BinaryOp,
           typename TestReducer,
           typename BinaryOpInterface>
-::testing::AssertionResult testReduce(
-    const char* test_name,
-    const unsigned seed,
-    ReduceData<Res, reduce_interface_tag, T> & data,
-    RAJA::Index_type N,
-    T init,
-    BinaryOp op,
-    TestReducer test_reducer, unordered_reduce_tag, reduce_interface_tag si, BinaryOpInterface ci)
+::testing::AssertionResult
+testReduce(const char* test_name,
+           const unsigned seed,
+           ReduceData<Res, reduce_interface_tag, T>& data,
+           RAJA::Index_type N,
+           T init,
+           BinaryOp op,
+           TestReducer test_reducer,
+           unordered_reduce_tag,
+           reduce_interface_tag si,
+           BinaryOpInterface ci)
 {
   doReduce(data, N, init, op, test_reducer, si, ci);
 
   T reduced_check_value = init;
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     reduced_check_value = op(std::move(reduced_check_value), data.values[i]);
   }
 
-  if (reduced_check_value != *data.reduced_value) {
+  if (reduced_check_value != *data.reduced_value)
+  {
     return ::testing::AssertionFailure()
            << test_reducer.name() << " (unordered reduce) " << test_name
            << " (with N " << N << " with seed " << seed << ")"
-           << " incorrect " << *data.reduced_value
-           << ", expected " << reduced_check_value;
+           << " incorrect " << *data.reduced_value << ", expected "
+           << reduced_check_value;
   }
 
   return ::testing::AssertionSuccess();
 }
 
 
-template <typename ValType,
-          typename Reducer,
-          typename Res>
-void testReducerInterfaces(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res)
+template <typename ValType, typename Reducer, typename Res>
+void testReducerInterfaces(unsigned seed,
+                           RAJA::Index_type MaxN,
+                           Reducer reducer,
+                           Res res)
 {
-  using reduce_category    = typename Reducer::reduce_category ;
-  using interface_category = typename Reducer::reduce_interface ;
+  using reduce_category    = typename Reducer::reduce_category;
+  using interface_category = typename Reducer::reduce_interface;
   using no_init_operator   = reduce_default_interface_tag;
   using init_no_operator   = reduce_init_interface_tag;
   using init_operator      = reduce_init_op_interface_tag;
 
   std::mt19937 rng(seed);
-  RAJA::Index_type N = std::uniform_int_distribution<RAJA::Index_type>((MaxN+1)/2, MaxN)(rng);
+  RAJA::Index_type N = std::uniform_int_distribution<RAJA::Index_type>(
+      (MaxN + 1) / 2, MaxN)(rng);
   std::uniform_int_distribution<RAJA::Index_type> dist(-N, N);
 
-  ReduceData<Res, interface_category, ValType> data(N, res, [&](){ return dist(rng); });
-
-  ASSERT_TRUE(testReduce("default", seed, data, N, RAJA::operators::plus<ValType>::identity(), RAJA::operators::plus<ValType>{},
-      reducer, reduce_category{}, interface_category{}, no_init_operator{}));
-  ASSERT_TRUE(testReduce("init", seed, data, N, ValType(N), RAJA::operators::plus<ValType>{},
-      reducer, reduce_category{}, interface_category{}, init_no_operator{}));
-  ASSERT_TRUE(testReduce("minimum", seed, data, N, ValType(0), RAJA::operators::minimum<ValType>{},
-      reducer, reduce_category{}, interface_category{}, init_operator{}));
-  ASSERT_TRUE(testReduce("Maximum", seed, data, N, ValType(0), RAJA::operators::maximum<ValType>{},
-      reducer, reduce_category{}, interface_category{}, init_operator{}));
+  ReduceData<Res, interface_category, ValType> data(
+      N, res, [&]() { return dist(rng); });
+
+  ASSERT_TRUE(testReduce(
+      "default", seed, data, N, RAJA::operators::plus<ValType>::identity(),
+      RAJA::operators::plus<ValType> {}, reducer, reduce_category {},
+      interface_category {}, no_init_operator {}));
+  ASSERT_TRUE(testReduce(
+      "init", seed, data, N, ValType(N), RAJA::operators::plus<ValType> {},
+      reducer, reduce_category {}, interface_category {}, init_no_operator {}));
+  ASSERT_TRUE(testReduce("minimum", seed, data, N, ValType(0),
+                         RAJA::operators::minimum<ValType> {}, reducer,
+                         reduce_category {}, interface_category {},
+                         init_operator {}));
+  ASSERT_TRUE(testReduce("Maximum", seed, data, N, ValType(0),
+                         RAJA::operators::maximum<ValType> {}, reducer,
+                         reduce_category {}, interface_category {},
+                         init_operator {}));
 }
 
-template <typename ValType,
-          typename Reducer,
-          typename Res>
+template <typename ValType, typename Reducer, typename Res>
 void testReducer(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res)
 {
   testReducerInterfaces<ValType>(seed, 0, reducer, res);
-  for (RAJA::Index_type n = 1; n <= MaxN; n *= 10) {
+  for (RAJA::Index_type n = 1; n <= MaxN; n *= 10)
+  {
     testReducerInterfaces<ValType>(seed, n, reducer, res);
   }
 }
 
 inline unsigned get_random_seed()
 {
-  static unsigned seed = std::random_device{}();
+  static unsigned seed = std::random_device {}();
   return seed;
 }
 
 
 TYPED_TEST_SUITE_P(ReduceUnitTest);
 
-template < typename T >
+template <typename T>
 class ReduceUnitTest : public ::testing::Test
-{ };
+{};
 
 TYPED_TEST_P(ReduceUnitTest, UnitReduce)
 {
@@ -303,9 +330,9 @@ TYPED_TEST_P(ReduceUnitTest, UnitReduce)
   using ValType  = typename camp::at<TypeParam, camp::num<2>>::type;
   using MaxNType = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  unsigned seed = get_random_seed();
+  unsigned seed         = get_random_seed();
   RAJA::Index_type MaxN = MaxNType::value;
-  Reducer reducer{};
+  Reducer reducer {};
   ResType res = ResType::get_default();
 
   testReducer<ValType>(seed, MaxN, reducer, res);
@@ -317,34 +344,21 @@ REGISTER_TYPED_TEST_SUITE_P(ReduceUnitTest, UnitReduce);
 //
 // Key types for reduce tests
 //
-using ReduceValTypeList =
-  camp::list<
-              RAJA::Index_type,
-              int,
+using ReduceValTypeList = camp::list<RAJA::Index_type,
+                                     int,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              unsigned,
-              long long,
-              unsigned long long,
-              float,
+                                     unsigned,
+                                     long long,
+                                     unsigned long long,
+                                     float,
 #endif
-              double
-            >;
+                                     double>;
 
 // Max test lengths for reduce tests
-using ReduceMaxNListDefault =
-  camp::list<
-              camp::num<10000>
-            >;
-
-using ReduceMaxNListSmall =
-  camp::list<
-              camp::num<1000>
-            >;
+using ReduceMaxNListDefault = camp::list<camp::num<10000>>;
 
-using ReduceMaxNListTiny =
-  camp::list<
-              camp::num<100>
-            >;
+using ReduceMaxNListSmall = camp::list<camp::num<1000>>;
 
-#endif //__TEST_ALGORITHM_REDUCE_UTILS_HPP__
+using ReduceMaxNListTiny = camp::list<camp::num<100>>;
 
+#endif  //__TEST_ALGORITHM_REDUCE_UTILS_HPP__
diff --git a/test/unit/algorithm/tests/test-algorithm-sort-utils.hpp b/test/unit/algorithm/tests/test-algorithm-sort-utils.hpp
index 4f3f5b4d64..046d631adf 100644
--- a/test/unit/algorithm/tests/test-algorithm-sort-utils.hpp
+++ b/test/unit/algorithm/tests/test-algorithm-sort-utils.hpp
@@ -37,20 +37,28 @@
 
 
 // tag classes to differentiate sort by attributes and apply correct testing
-struct unstable_sort_tag { };
-struct stable_sort_tag { };
-
-struct sort_interface_tag { };
-struct sort_pairs_interface_tag { };
-
-struct sort_default_interface_tag { };
-struct sort_comp_interface_tag { };
-struct sort_res_default_interface_tag { };
-struct sort_res_comp_interface_tag { };
+struct unstable_sort_tag
+{};
+struct stable_sort_tag
+{};
+
+struct sort_interface_tag
+{};
+struct sort_pairs_interface_tag
+{};
+
+struct sort_default_interface_tag
+{};
+struct sort_comp_interface_tag
+{};
+struct sort_res_default_interface_tag
+{};
+struct sort_res_comp_interface_tag
+{};
 
 
 // synchronize based on a RAJA execution policy
-template < typename policy >
+template <typename policy>
 struct PolicySynchronize
 {
   void synchronize()
@@ -61,24 +69,30 @@ struct PolicySynchronize
 
 #if defined(RAJA_ENABLE_CUDA)
 // partial specialization for cuda_exec
-template < size_t BLOCK_SIZE, bool Async >
+template <size_t BLOCK_SIZE, bool Async>
 struct PolicySynchronize<RAJA::cuda_exec<BLOCK_SIZE, Async>>
 {
   void synchronize()
   {
-    if (Async) { RAJA::synchronize<RAJA::cuda_synchronize>(); }
+    if (Async)
+    {
+      RAJA::synchronize<RAJA::cuda_synchronize>();
+    }
   }
 };
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 // partial specialization for hip_exec
-template < size_t BLOCK_SIZE, bool Async >
+template <size_t BLOCK_SIZE, bool Async>
 struct PolicySynchronize<RAJA::hip_exec<BLOCK_SIZE, Async>>
 {
   void synchronize()
   {
-    if (Async) { RAJA::synchronize<RAJA::hip_synchronize>(); }
+    if (Async)
+    {
+      RAJA::synchronize<RAJA::hip_synchronize>();
+    }
   }
 };
 #endif
@@ -93,41 +107,42 @@ struct SortData;
 template <typename Res, typename K, typename V>
 struct SortData<Res, sort_interface_tag, K, V>
 {
-  K* orig_keys = nullptr;
+  K* orig_keys   = nullptr;
   K* sorted_keys = nullptr;
   Res m_res;
 
-  template < typename RandomGenerator >
-  SortData(size_t N, Res res, RandomGenerator gen_random)
-    : m_res(res)
+  template <typename RandomGenerator>
+  SortData(size_t N, Res res, RandomGenerator gen_random) : m_res(res)
   {
-    if (N > 0) {
-      orig_keys = m_res.template allocate<K>(N, camp::resources::MemoryAccess::Managed);
-      sorted_keys = m_res.template allocate<K>(N, camp::resources::MemoryAccess::Managed);
+    if (N > 0)
+    {
+      orig_keys =
+          m_res.template allocate<K>(N, camp::resources::MemoryAccess::Managed);
+      sorted_keys =
+          m_res.template allocate<K>(N, camp::resources::MemoryAccess::Managed);
     }
 
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; i++)
+    {
       orig_keys[i] = gen_random();
     }
   }
 
   void copy_data(size_t N)
   {
-    if ( N == 0 ) return;
-    m_res.memcpy(sorted_keys, orig_keys, N*sizeof(K));
+    if (N == 0) return;
+    m_res.memcpy(sorted_keys, orig_keys, N * sizeof(K));
   }
 
-  Res resource()
-  {
-    return m_res;
-  }
+  Res resource() { return m_res; }
 
-  SortData(SortData const&) = delete;
+  SortData(SortData const&)            = delete;
   SortData& operator=(SortData const&) = delete;
 
   ~SortData()
   {
-    if (orig_keys != nullptr) {
+    if (orig_keys != nullptr)
+    {
       m_res.deallocate(orig_keys, camp::resources::MemoryAccess::Managed);
       m_res.deallocate(sorted_keys, camp::resources::MemoryAccess::Managed);
     }
@@ -136,23 +151,28 @@ struct SortData<Res, sort_interface_tag, K, V>
 
 
 template <typename Res, typename K, typename V>
-struct SortData<Res, sort_pairs_interface_tag, K, V> : SortData<Res, sort_interface_tag, K, V>
+struct SortData<Res, sort_pairs_interface_tag, K, V>
+    : SortData<Res, sort_interface_tag, K, V>
 {
   using base = SortData<Res, sort_interface_tag, K, V>;
 
-  V* orig_vals = nullptr;
+  V* orig_vals   = nullptr;
   V* sorted_vals = nullptr;
 
-  template < typename RandomGenerator >
+  template <typename RandomGenerator>
   SortData(size_t N, Res res, RandomGenerator gen_random)
-    : base(N, res, gen_random)
+      : base(N, res, gen_random)
   {
-    if (N > 0) {
-      orig_vals = this->m_res.template allocate<V>(N, camp::resources::MemoryAccess::Managed);
-      sorted_vals = this->m_res.template allocate<V>(N, camp::resources::MemoryAccess::Managed);
+    if (N > 0)
+    {
+      orig_vals = this->m_res.template allocate<V>(
+          N, camp::resources::MemoryAccess::Managed);
+      sorted_vals = this->m_res.template allocate<V>(
+          N, camp::resources::MemoryAccess::Managed);
     }
 
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; i++)
+    {
       orig_vals[i] = gen_random();
     }
   }
@@ -160,31 +180,32 @@ struct SortData<Res, sort_pairs_interface_tag, K, V> : SortData<Res, sort_interf
   void copy_data(size_t N)
   {
     base::copy_data(N);
-    if ( N == 0 ) return;
-    this->m_res.memcpy(sorted_vals, orig_vals, N*sizeof(V));
+    if (N == 0) return;
+    this->m_res.memcpy(sorted_vals, orig_vals, N * sizeof(V));
   }
 
-  SortData(SortData const&) = delete;
+  SortData(SortData const&)            = delete;
   SortData& operator=(SortData const&) = delete;
 
   ~SortData()
   {
-    if (orig_vals != nullptr) {
+    if (orig_vals != nullptr)
+    {
       this->m_res.deallocate(orig_vals, camp::resources::MemoryAccess::Managed);
-      this->m_res.deallocate(sorted_vals, camp::resources::MemoryAccess::Managed);
+      this->m_res.deallocate(sorted_vals,
+                             camp::resources::MemoryAccess::Managed);
     }
   }
 };
 
 
-template <typename Res,
-          typename T,
-          typename Compare,
-          typename Sorter>
-void doSort(SortData<Res, sort_interface_tag, T> & data,
+template <typename Res, typename T, typename Compare, typename Sorter>
+void doSort(SortData<Res, sort_interface_tag, T>& data,
             RAJA::Index_type N,
             Compare,
-            Sorter sorter, sort_interface_tag, sort_default_interface_tag)
+            Sorter sorter,
+            sort_interface_tag,
+            sort_default_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
@@ -192,50 +213,43 @@ void doSort(SortData<Res, sort_interface_tag, T> & data,
   sorter.synchronize();
 }
 
-template <typename Res,
-          typename T,
-          typename Compare,
-          typename Sorter>
-void doSort(SortData<Res, sort_interface_tag, T> & data,
+template <typename Res, typename T, typename Compare, typename Sorter>
+void doSort(SortData<Res, sort_interface_tag, T>& data,
             RAJA::Index_type N,
             Compare comp,
-            Sorter sorter, sort_interface_tag, sort_comp_interface_tag)
+            Sorter sorter,
+            sort_interface_tag,
+            sort_comp_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
-  sorter(RAJA::make_span(data.sorted_keys, N),
-         comp);
+  sorter(RAJA::make_span(data.sorted_keys, N), comp);
   sorter.synchronize();
 }
 
-template <typename Res,
-          typename T,
-          typename Compare,
-          typename Sorter>
-void doSort(SortData<Res, sort_interface_tag, T> & data,
+template <typename Res, typename T, typename Compare, typename Sorter>
+void doSort(SortData<Res, sort_interface_tag, T>& data,
             RAJA::Index_type N,
             Compare,
-            Sorter sorter, sort_interface_tag, sort_res_default_interface_tag)
+            Sorter sorter,
+            sort_interface_tag,
+            sort_res_default_interface_tag)
 {
   data.copy_data(N);
-  sorter(data.resource(),
-         RAJA::make_span(data.sorted_keys, N));
+  sorter(data.resource(), RAJA::make_span(data.sorted_keys, N));
   data.resource().wait();
 }
 
-template <typename Res,
-          typename T,
-          typename Compare,
-          typename Sorter>
-void doSort(SortData<Res, sort_interface_tag, T> & data,
+template <typename Res, typename T, typename Compare, typename Sorter>
+void doSort(SortData<Res, sort_interface_tag, T>& data,
             RAJA::Index_type N,
             Compare comp,
-            Sorter sorter, sort_interface_tag, sort_res_comp_interface_tag)
+            Sorter sorter,
+            sort_interface_tag,
+            sort_res_comp_interface_tag)
 {
   data.copy_data(N);
-  sorter(data.resource(),
-         RAJA::make_span(data.sorted_keys, N),
-         comp);
+  sorter(data.resource(), RAJA::make_span(data.sorted_keys, N), comp);
   data.resource().wait();
 }
 
@@ -244,10 +258,12 @@ template <typename Res,
           typename V,
           typename Compare,
           typename Sorter>
-void doSort(SortData<Res, sort_pairs_interface_tag, K, V> & data,
+void doSort(SortData<Res, sort_pairs_interface_tag, K, V>& data,
             RAJA::Index_type N,
             Compare,
-            Sorter sorter, sort_pairs_interface_tag, sort_default_interface_tag)
+            Sorter sorter,
+            sort_pairs_interface_tag,
+            sort_default_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
@@ -261,16 +277,17 @@ template <typename Res,
           typename V,
           typename Compare,
           typename Sorter>
-void doSort(SortData<Res, sort_pairs_interface_tag, K, V> & data,
+void doSort(SortData<Res, sort_pairs_interface_tag, K, V>& data,
             RAJA::Index_type N,
             Compare comp,
-            Sorter sorter, sort_pairs_interface_tag, sort_comp_interface_tag)
+            Sorter sorter,
+            sort_pairs_interface_tag,
+            sort_comp_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
   sorter(RAJA::make_span(data.sorted_keys, N),
-         RAJA::make_span(data.sorted_vals, N),
-         comp);
+         RAJA::make_span(data.sorted_vals, N), comp);
   sorter.synchronize();
 }
 
@@ -279,14 +296,15 @@ template <typename Res,
           typename V,
           typename Compare,
           typename Sorter>
-void doSort(SortData<Res, sort_pairs_interface_tag, K, V> & data,
+void doSort(SortData<Res, sort_pairs_interface_tag, K, V>& data,
             RAJA::Index_type N,
             Compare,
-            Sorter sorter, sort_pairs_interface_tag, sort_res_default_interface_tag)
+            Sorter sorter,
+            sort_pairs_interface_tag,
+            sort_res_default_interface_tag)
 {
   data.copy_data(N);
-  sorter(data.resource(),
-         RAJA::make_span(data.sorted_keys, N),
+  sorter(data.resource(), RAJA::make_span(data.sorted_keys, N),
          RAJA::make_span(data.sorted_vals, N));
   data.resource().wait();
 }
@@ -296,16 +314,16 @@ template <typename Res,
           typename V,
           typename Compare,
           typename Sorter>
-void doSort(SortData<Res, sort_pairs_interface_tag, K, V> & data,
+void doSort(SortData<Res, sort_pairs_interface_tag, K, V>& data,
             RAJA::Index_type N,
             Compare comp,
-            Sorter sorter, sort_pairs_interface_tag, sort_res_comp_interface_tag)
+            Sorter sorter,
+            sort_pairs_interface_tag,
+            sort_res_comp_interface_tag)
 {
   data.copy_data(N);
-  sorter(data.resource(),
-         RAJA::make_span(data.sorted_keys, N),
-         RAJA::make_span(data.sorted_vals, N),
-         comp);
+  sorter(data.resource(), RAJA::make_span(data.sorted_keys, N),
+         RAJA::make_span(data.sorted_vals, N), comp);
   data.resource().wait();
 }
 
@@ -315,57 +333,60 @@ template <typename Res,
           typename Compare,
           typename TestSorter,
           typename CompareInterface>
-::testing::AssertionResult testSort(
-    const char* test_name,
-    const unsigned seed,
-    SortData<Res, sort_interface_tag, T> & data,
-    RAJA::Index_type N,
-    Compare comp,
-    TestSorter test_sorter, unstable_sort_tag, sort_interface_tag si, CompareInterface ci)
+::testing::AssertionResult testSort(const char* test_name,
+                                    const unsigned seed,
+                                    SortData<Res, sort_interface_tag, T>& data,
+                                    RAJA::Index_type N,
+                                    Compare comp,
+                                    TestSorter test_sorter,
+                                    unstable_sort_tag,
+                                    sort_interface_tag si,
+                                    CompareInterface ci)
 {
   doSort(data, N, comp, test_sorter, si, ci);
 
   // make map of keys to keys
   using val_map = std::unordered_multiset<T>;
   std::unordered_map<T, val_map> keys;
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     auto key_iter = keys.find(data.orig_keys[i]);
-    if (key_iter == keys.end()) {
-      auto ret = keys.emplace(data.orig_keys[i], val_map{});
+    if (key_iter == keys.end())
+    {
+      auto ret = keys.emplace(data.orig_keys[i], val_map {});
       assert(ret.second);
       key_iter = ret.first;
     }
     key_iter->second.emplace(data.orig_keys[i]);
   }
 
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     // test ordering
-    if (i > 0 && comp(data.sorted_keys[i], data.sorted_keys[i-1]))
+    if (i > 0 && comp(data.sorted_keys[i], data.sorted_keys[i - 1]))
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (unstable sort) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
-             << " out of order "
-             << data.sorted_keys[i-1] << ", " << data.sorted_keys[i]
-             << " (at index " << i-1 << ")";
+             << " out of order " << data.sorted_keys[i - 1] << ", "
+             << data.sorted_keys[i] << " (at index " << i - 1 << ")";
     // test there is an item with this
     auto key_iter = keys.find(data.sorted_keys[i]);
     if (key_iter == keys.end())
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (unstable sort) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
-             << " unknown or duplicate key "
-             << data.sorted_keys[i]
+             << " unknown or duplicate key " << data.sorted_keys[i]
              << " (at index " << i << ")";
     auto val_iter = key_iter->second.find(data.sorted_keys[i]);
     if (val_iter == key_iter->second.end())
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (unstable sort) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
-             << " unknown or duplicate val "
-             << data.sorted_keys[i]
+             << " unknown or duplicate val " << data.sorted_keys[i]
              << " (at index " << i << ")";
     key_iter->second.erase(val_iter);
-    if (key_iter->second.size() == 0) {
+    if (key_iter->second.size() == 0)
+    {
       keys.erase(key_iter);
     }
   }
@@ -377,56 +398,59 @@ template <typename Res,
           typename Compare,
           typename TestSorter,
           typename CompareInterface>
-::testing::AssertionResult testSort(
-    const char* test_name,
-    const unsigned seed,
-    SortData<Res, sort_interface_tag, T> & data,
-    RAJA::Index_type N,
-    Compare comp,
-    TestSorter test_sorter, stable_sort_tag, sort_interface_tag si, CompareInterface ci)
+::testing::AssertionResult testSort(const char* test_name,
+                                    const unsigned seed,
+                                    SortData<Res, sort_interface_tag, T>& data,
+                                    RAJA::Index_type N,
+                                    Compare comp,
+                                    TestSorter test_sorter,
+                                    stable_sort_tag,
+                                    sort_interface_tag si,
+                                    CompareInterface ci)
 {
   doSort(data, N, comp, test_sorter, si, ci);
 
   // make map of keys to keys
   using val_map = std::list<T>;
   std::unordered_map<T, val_map> keys;
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     auto key_iter = keys.find(data.orig_keys[i]);
-    if (key_iter == keys.end()) {
-      auto ret = keys.emplace(data.orig_keys[i], val_map{});
+    if (key_iter == keys.end())
+    {
+      auto ret = keys.emplace(data.orig_keys[i], val_map {});
       assert(ret.second);
       key_iter = ret.first;
     }
     key_iter->second.emplace_back(data.orig_keys[i]);
   }
 
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     // test ordering
-    if (i > 0 && comp(data.sorted_keys[i], data.sorted_keys[i-1]))
+    if (i > 0 && comp(data.sorted_keys[i], data.sorted_keys[i - 1]))
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (stable sort) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
-             << " out of order "
-             << data.sorted_keys[i-1] << ", " << data.sorted_keys[i]
-             << " (at index " << i-1 << ")";
+             << " out of order " << data.sorted_keys[i - 1] << ", "
+             << data.sorted_keys[i] << " (at index " << i - 1 << ")";
     // test there is an item with this
     auto key_iter = keys.find(data.sorted_keys[i]);
     if (key_iter == keys.end())
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (stable sort) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
-             << " unknown or duplicate key "
-             << data.sorted_keys[i]
+             << " unknown or duplicate key " << data.sorted_keys[i]
              << " (at index " << i << ")";
     if (key_iter->second.front() != data.sorted_keys[i])
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (stable sort) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
-             << " out of stable order or unknown val "
-             << data.sorted_keys[i]
+             << " out of stable order or unknown val " << data.sorted_keys[i]
              << " (at index " << i << ")";
     key_iter->second.pop_front();
-    if (key_iter->second.size() == 0) {
+    if (key_iter->second.size() == 0)
+    {
       keys.erase(key_iter);
     }
   }
@@ -440,38 +464,45 @@ template <typename Res,
           typename Compare,
           typename TestSorter,
           typename CompareInterface>
-::testing::AssertionResult testSort(
-    const char* test_name,
-    const unsigned seed,
-    SortData<Res, sort_pairs_interface_tag, K, V> & data,
-    RAJA::Index_type N,
-    Compare comp,
-    TestSorter test_sorter, unstable_sort_tag, sort_pairs_interface_tag si, CompareInterface ci)
+::testing::AssertionResult
+testSort(const char* test_name,
+         const unsigned seed,
+         SortData<Res, sort_pairs_interface_tag, K, V>& data,
+         RAJA::Index_type N,
+         Compare comp,
+         TestSorter test_sorter,
+         unstable_sort_tag,
+         sort_pairs_interface_tag si,
+         CompareInterface ci)
 {
   doSort(data, N, comp, test_sorter, si, ci);
 
   // make map of keys to vals
   using val_map = std::unordered_multiset<V>;
   std::unordered_map<K, val_map> keys_to_vals;
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     auto key_iter = keys_to_vals.find(data.orig_keys[i]);
-    if (key_iter == keys_to_vals.end()) {
-      auto ret = keys_to_vals.emplace(data.orig_keys[i], val_map{});
+    if (key_iter == keys_to_vals.end())
+    {
+      auto ret = keys_to_vals.emplace(data.orig_keys[i], val_map {});
       assert(ret.second);
       key_iter = ret.first;
     }
     key_iter->second.emplace(data.orig_vals[i]);
   }
 
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     // test ordering
-    if (i > 0 && comp(data.sorted_keys[i], data.sorted_keys[i-1]))
+    if (i > 0 && comp(data.sorted_keys[i], data.sorted_keys[i - 1]))
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (unstable sort pairs) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
-             << " keys " << data.sorted_keys[i-1] << ", " << data.sorted_keys[i] << " out of order"
-             << " vals " << data.sorted_vals[i-1] << ", " << data.sorted_vals[i]
-             << " (at index " << i-1 << ")";
+             << " keys " << data.sorted_keys[i - 1] << ", "
+             << data.sorted_keys[i] << " out of order"
+             << " vals " << data.sorted_vals[i - 1] << ", "
+             << data.sorted_vals[i] << " (at index " << i - 1 << ")";
     // test there is a pair with this key and val
     auto key_iter = keys_to_vals.find(data.sorted_keys[i]);
     if (key_iter == keys_to_vals.end())
@@ -479,8 +510,7 @@ ::testing::AssertionResult testSort(
              << test_sorter.name() << " (unstable sort pairs) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
              << " unknown or duplicate key "
-             << " key " << data.sorted_keys[i]
-             << " val " << data.sorted_vals[i]
+             << " key " << data.sorted_keys[i] << " val " << data.sorted_vals[i]
              << " (at index " << i << ")";
     auto val_iter = key_iter->second.find(data.sorted_vals[i]);
     if (val_iter == key_iter->second.end())
@@ -488,11 +518,11 @@ ::testing::AssertionResult testSort(
              << test_sorter.name() << " (unstable sort pairs) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
              << " unknown or duplicate val "
-             << " key " << data.sorted_keys[i]
-             << " val " << data.sorted_vals[i]
+             << " key " << data.sorted_keys[i] << " val " << data.sorted_vals[i]
              << " (at index " << i << ")";
     key_iter->second.erase(val_iter);
-    if (key_iter->second.size() == 0) {
+    if (key_iter->second.size() == 0)
+    {
       keys_to_vals.erase(key_iter);
     }
   }
@@ -505,39 +535,45 @@ template <typename Res,
           typename Compare,
           typename TestSorter,
           typename CompareInterface>
-::testing::AssertionResult testSort(
-    const char* test_name,
-    const unsigned seed,
-    SortData<Res, sort_pairs_interface_tag, K, V> & data,
-    RAJA::Index_type N,
-    Compare comp,
-    TestSorter test_sorter, stable_sort_tag, sort_pairs_interface_tag si, CompareInterface ci)
+::testing::AssertionResult
+testSort(const char* test_name,
+         const unsigned seed,
+         SortData<Res, sort_pairs_interface_tag, K, V>& data,
+         RAJA::Index_type N,
+         Compare comp,
+         TestSorter test_sorter,
+         stable_sort_tag,
+         sort_pairs_interface_tag si,
+         CompareInterface ci)
 {
   doSort(data, N, comp, test_sorter, si, ci);
 
   // make map of keys to vals
   using val_map = std::list<V>;
   std::unordered_map<K, val_map> keys_to_vals;
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     auto key_iter = keys_to_vals.find(data.orig_keys[i]);
-    if (key_iter == keys_to_vals.end()) {
-      auto ret = keys_to_vals.emplace(data.orig_keys[i], val_map{});
+    if (key_iter == keys_to_vals.end())
+    {
+      auto ret = keys_to_vals.emplace(data.orig_keys[i], val_map {});
       assert(ret.second);
       key_iter = ret.first;
     }
     key_iter->second.emplace_back(data.orig_vals[i]);
   }
 
-  for (RAJA::Index_type i = 0; i < N; i++) {
+  for (RAJA::Index_type i = 0; i < N; i++)
+  {
     // test ordering
-    if (i > 0 && comp(data.sorted_keys[i], data.sorted_keys[i-1]))
+    if (i > 0 && comp(data.sorted_keys[i], data.sorted_keys[i - 1]))
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (stable sort pairs) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
              << " out of order "
-             << " keys " << data.sorted_keys[i-1] << ", " << data.sorted_keys[i]
-             << " vals " << data.sorted_vals[i-1] << ", " << data.sorted_vals[i]
-             << " (at index " << i-1 << ")";
+             << " keys " << data.sorted_keys[i - 1] << ", "
+             << data.sorted_keys[i] << " vals " << data.sorted_vals[i - 1]
+             << ", " << data.sorted_vals[i] << " (at index " << i - 1 << ")";
     // test there is a pair with this key and val
     auto key_iter = keys_to_vals.find(data.sorted_keys[i]);
     if (key_iter == keys_to_vals.end())
@@ -545,19 +581,18 @@ ::testing::AssertionResult testSort(
              << test_sorter.name() << " (stable sort pairs) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
              << " unknown or duplicate key "
-             << " key " << data.sorted_keys[i]
-             << " val " << data.sorted_vals[i]
+             << " key " << data.sorted_keys[i] << " val " << data.sorted_vals[i]
              << " (at index " << i << ")";
     if (key_iter->second.front() != data.sorted_vals[i])
       return ::testing::AssertionFailure()
              << test_sorter.name() << " (stable sort pairs) " << test_name
              << " (with N " << N << " with seed " << seed << ")"
              << " out of stable order or unknown val "
-             << " key " << data.sorted_keys[i]
-             << " val " << data.sorted_vals[i]
+             << " key " << data.sorted_keys[i] << " val " << data.sorted_vals[i]
              << " (at index " << i << ")";
     key_iter->second.pop_front();
-    if (key_iter->second.size() == 0) {
+    if (key_iter->second.size() == 0)
+    {
       keys_to_vals.erase(key_iter);
     }
   }
@@ -565,95 +600,97 @@ ::testing::AssertionResult testSort(
 }
 
 
-template <typename Res,
-          typename K,
-          typename V,
-          typename Sorter>
+template <typename Res, typename K, typename V, typename Sorter>
 void testSorterResInterfaces(
     std::false_type,
     unsigned,
-    SortData<Res, typename Sorter::sort_interface, K, V> &,
+    SortData<Res, typename Sorter::sort_interface, K, V>&,
     RAJA::Index_type,
     Sorter)
 {
   // Sorter does not support resource interface, no tests
 }
 
-template <typename Res,
-          typename K,
-          typename V,
-          typename Sorter>
+template <typename Res, typename K, typename V, typename Sorter>
 void testSorterResInterfaces(
     std::true_type,
     unsigned seed,
-    SortData<Res, typename Sorter::sort_interface, K, V> & data,
+    SortData<Res, typename Sorter::sort_interface, K, V>& data,
     RAJA::Index_type N,
     Sorter sorter)
 {
   // Sorter supports resource interface, res tests
-  using stability_category = typename Sorter::sort_category ;
-  using pairs_category     = typename Sorter::sort_interface ;
+  using stability_category      = typename Sorter::sort_category;
+  using pairs_category          = typename Sorter::sort_interface;
   using resource_no_comparator  = sort_res_default_interface_tag;
   using resource_use_comparator = sort_res_comp_interface_tag;
 
-  ASSERT_TRUE(testSort("resource+default", seed, data, N, RAJA::operators::less<K>{},
-      sorter, stability_category{}, pairs_category{}, resource_no_comparator{}));
-  ASSERT_TRUE(testSort("resource+ascending", seed, data, N, RAJA::operators::less<K>{},
-      sorter, stability_category{}, pairs_category{}, resource_use_comparator{}));
-  ASSERT_TRUE(testSort("resource+descending", seed, data, N, RAJA::operators::greater<K>{},
-      sorter, stability_category{}, pairs_category{}, resource_use_comparator{}));
+  ASSERT_TRUE(testSort(
+      "resource+default", seed, data, N, RAJA::operators::less<K> {}, sorter,
+      stability_category {}, pairs_category {}, resource_no_comparator {}));
+  ASSERT_TRUE(testSort(
+      "resource+ascending", seed, data, N, RAJA::operators::less<K> {}, sorter,
+      stability_category {}, pairs_category {}, resource_use_comparator {}));
+  ASSERT_TRUE(testSort("resource+descending", seed, data, N,
+                       RAJA::operators::greater<K> {}, sorter,
+                       stability_category {}, pairs_category {},
+                       resource_use_comparator {}));
 }
 
-template <typename K,
-          typename Sorter,
-          typename Res>
-void testSorterInterfaces(unsigned seed, RAJA::Index_type MaxN, Sorter sorter, Res res)
+template <typename K, typename Sorter, typename Res>
+void testSorterInterfaces(unsigned seed,
+                          RAJA::Index_type MaxN,
+                          Sorter sorter,
+                          Res res)
 {
-  using stability_category = typename Sorter::sort_category ;
-  using pairs_category     = typename Sorter::sort_interface ;
-  using supports_resource  = typename Sorter::supports_resource ;
+  using stability_category = typename Sorter::sort_category;
+  using pairs_category     = typename Sorter::sort_interface;
+  using supports_resource  = typename Sorter::supports_resource;
   using no_comparator      = sort_default_interface_tag;
   using use_comparator     = sort_comp_interface_tag;
 
   std::mt19937 rng(seed);
-  RAJA::Index_type N = std::uniform_int_distribution<RAJA::Index_type>((MaxN+1)/2, MaxN)(rng);
+  RAJA::Index_type N = std::uniform_int_distribution<RAJA::Index_type>(
+      (MaxN + 1) / 2, MaxN)(rng);
   std::uniform_int_distribution<RAJA::Index_type> dist(-N, N);
 
-  SortData<Res, pairs_category, K> data(N, res, [&](){ return dist(rng); });
+  SortData<Res, pairs_category, K> data(N, res, [&]() { return dist(rng); });
 
-  ASSERT_TRUE(testSort("default", seed, data, N, RAJA::operators::less<K>{},
-      sorter, stability_category{}, pairs_category{}, no_comparator{}));
-  ASSERT_TRUE(testSort("ascending", seed, data, N, RAJA::operators::less<K>{},
-      sorter, stability_category{}, pairs_category{}, use_comparator{}));
-  ASSERT_TRUE(testSort("descending", seed, data, N, RAJA::operators::greater<K>{},
-      sorter, stability_category{}, pairs_category{}, use_comparator{}));
+  ASSERT_TRUE(testSort("default", seed, data, N, RAJA::operators::less<K> {},
+                       sorter, stability_category {}, pairs_category {},
+                       no_comparator {}));
+  ASSERT_TRUE(testSort("ascending", seed, data, N, RAJA::operators::less<K> {},
+                       sorter, stability_category {}, pairs_category {},
+                       use_comparator {}));
+  ASSERT_TRUE(testSort(
+      "descending", seed, data, N, RAJA::operators::greater<K> {}, sorter,
+      stability_category {}, pairs_category {}, use_comparator {}));
 
   testSorterResInterfaces(supports_resource(), seed, data, N, sorter);
 }
 
-template <typename K,
-          typename Sorter,
-          typename Res>
+template <typename K, typename Sorter, typename Res>
 void testSorter(unsigned seed, RAJA::Index_type MaxN, Sorter sorter, Res res)
 {
   testSorterInterfaces<K>(seed, 0, sorter, res);
-  for (RAJA::Index_type n = 1; n <= MaxN; n *= 10) {
+  for (RAJA::Index_type n = 1; n <= MaxN; n *= 10)
+  {
     testSorterInterfaces<K>(seed, n, sorter, res);
   }
 }
 
 inline unsigned get_random_seed()
 {
-  static unsigned seed = std::random_device{}();
+  static unsigned seed = std::random_device {}();
   return seed;
 }
 
 
 TYPED_TEST_SUITE_P(SortUnitTest);
 
-template < typename T >
+template <typename T>
 class SortUnitTest : public ::testing::Test
-{ };
+{};
 
 TYPED_TEST_P(SortUnitTest, UnitSort)
 {
@@ -662,9 +699,9 @@ TYPED_TEST_P(SortUnitTest, UnitSort)
   using KeyType  = typename camp::at<TypeParam, camp::num<2>>::type;
   using MaxNType = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  unsigned seed = get_random_seed();
+  unsigned seed         = get_random_seed();
   RAJA::Index_type MaxN = MaxNType::value;
-  Sorter sorter{};
+  Sorter sorter {};
   ResType res = ResType::get_default();
 
   testSorter<KeyType>(seed, MaxN, sorter, res);
@@ -676,34 +713,21 @@ REGISTER_TYPED_TEST_SUITE_P(SortUnitTest, UnitSort);
 //
 // Key types for sort tests
 //
-using SortKeyTypeList =
-  camp::list<
-              RAJA::Index_type,
-              int,
+using SortKeyTypeList = camp::list<RAJA::Index_type,
+                                   int,
 #if defined(RAJA_TEST_EXHAUSTIVE)
-              unsigned,
-              long long,
-              unsigned long long,
-              float,
+                                   unsigned,
+                                   long long,
+                                   unsigned long long,
+                                   float,
 #endif
-              double
-            >;
+                                   double>;
 
 // Max test lengths for sort tests
-using SortMaxNListDefault =
-  camp::list<
-              camp::num<10000>
-            >;
-
-using SortMaxNListSmall =
-  camp::list<
-              camp::num<1000>
-            >;
+using SortMaxNListDefault = camp::list<camp::num<10000>>;
 
-using SortMaxNListTiny =
-  camp::list<
-              camp::num<100>
-            >;
+using SortMaxNListSmall = camp::list<camp::num<1000>>;
 
-#endif //__TEST_ALGORITHM_SORT_UTILS_HPP__
+using SortMaxNListTiny = camp::list<camp::num<100>>;
 
+#endif  //__TEST_ALGORITHM_SORT_UTILS_HPP__
diff --git a/test/unit/algorithm/tests/test-algorithm-sort.hpp b/test/unit/algorithm/tests/test-algorithm-sort.hpp
index d08f949fae..02daab1c60 100644
--- a/test/unit/algorithm/tests/test-algorithm-sort.hpp
+++ b/test/unit/algorithm/tests/test-algorithm-sort.hpp
@@ -22,60 +22,49 @@
 
 #include "test-algorithm-sort-utils.hpp"
 
-template < typename policy >
-struct PolicySort
-  : PolicySynchronize<policy>
+template <typename policy>
+struct PolicySort : PolicySynchronize<policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::true_type;
 
   std::string m_name;
 
-  PolicySort()
-    : m_name("RAJA::sort<unknown>")
-  { }
+  PolicySort() : m_name("RAJA::sort<unknown>") {}
 
   PolicySort(std::string const& policy_name)
-    : m_name(std::string("RAJA::sort<") + policy_name + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::sort<") + policy_name + std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::sort<policy>(std::forward<Args>(args)...);
   }
 };
 
-template < typename policy >
-struct PolicySortPairs
-  : PolicySynchronize<policy>
+template <typename policy>
+struct PolicySortPairs : PolicySynchronize<policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::true_type;
 
   std::string m_name;
 
-  PolicySortPairs()
-    : m_name("RAJA::sort<unknown>[pairs]")
-  { }
+  PolicySortPairs() : m_name("RAJA::sort<unknown>[pairs]") {}
 
   PolicySortPairs(std::string const& policy_name)
-    : m_name(std::string("RAJA::sort<") + policy_name + std::string(">[pairs]"))
-  { }
+      : m_name(std::string("RAJA::sort<") + policy_name +
+               std::string(">[pairs]"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::sort_pairs<policy>(std::forward<Args>(args)...);
@@ -84,41 +73,30 @@ struct PolicySortPairs
 
 
 using SequentialSortSorters =
-  camp::list<
-              PolicySort<RAJA::seq_exec>,
-              PolicySortPairs<RAJA::seq_exec>
-            >;
+    camp::list<PolicySort<RAJA::seq_exec>, PolicySortPairs<RAJA::seq_exec>>;
 
 #if defined(RAJA_ENABLE_OPENMP)
 
 using OpenMPSortSorters =
-  camp::list<
-              PolicySort<RAJA::omp_parallel_for_exec>,
-              PolicySortPairs<RAJA::omp_parallel_for_exec>
-            >;
+    camp::list<PolicySort<RAJA::omp_parallel_for_exec>,
+               PolicySortPairs<RAJA::omp_parallel_for_exec>>;
 
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
 
 using CudaSortSorters =
-  camp::list<
-              PolicySort<RAJA::cuda_exec<128>>,
-              PolicySortPairs<RAJA::cuda_exec<128>>,
-              PolicySort<RAJA::cuda_exec_explicit<128, 2>>
-            >;
+    camp::list<PolicySort<RAJA::cuda_exec<128>>,
+               PolicySortPairs<RAJA::cuda_exec<128>>,
+               PolicySort<RAJA::cuda_exec_explicit<128, 2>>>;
 
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 
-using HipSortSorters =
-  camp::list<
-              PolicySort<RAJA::hip_exec<128>>,
-              PolicySortPairs<RAJA::hip_exec<128>>
-            >;
+using HipSortSorters = camp::list<PolicySort<RAJA::hip_exec<128>>,
+                                  PolicySortPairs<RAJA::hip_exec<128>>>;
 
 #endif
 
-#endif //__TEST_UNIT_ALGORITHM_SORT_HPP__
-
+#endif  //__TEST_UNIT_ALGORITHM_SORT_HPP__
diff --git a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp
index 6b33d63497..c4c9189732 100644
--- a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp
+++ b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp
@@ -23,60 +23,50 @@
 #include "test-algorithm-sort-utils.hpp"
 
 
-template < typename policy >
-struct PolicyStableSort
-  : PolicySynchronize<policy>
+template <typename policy>
+struct PolicyStableSort : PolicySynchronize<policy>
 {
-  using sort_category = stable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = stable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::true_type;
 
   std::string m_name;
 
-  PolicyStableSort()
-    : m_name("RAJA::stable_sort<unknown>")
-  { }
+  PolicyStableSort() : m_name("RAJA::stable_sort<unknown>") {}
 
   PolicyStableSort(std::string const& policy_name)
-    : m_name(std::string("RAJA::stable_sort<") + policy_name + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::stable_sort<") + policy_name +
+               std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::stable_sort<policy>(std::forward<Args>(args)...);
   }
 };
 
-template < typename policy >
-struct PolicyStableSortPairs
-  : PolicySynchronize<policy>
+template <typename policy>
+struct PolicyStableSortPairs : PolicySynchronize<policy>
 {
-  using sort_category = stable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = stable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::true_type;
 
   std::string m_name;
 
-  PolicyStableSortPairs()
-    : m_name("RAJA::stable_sort<unknown>[pairs]")
-  { }
+  PolicyStableSortPairs() : m_name("RAJA::stable_sort<unknown>[pairs]") {}
 
   PolicyStableSortPairs(std::string const& policy_name)
-    : m_name(std::string("RAJA::stable_sort<") + policy_name + std::string(">[pairs]"))
-  { }
+      : m_name(std::string("RAJA::stable_sort<") + policy_name +
+               std::string(">[pairs]"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::stable_sort_pairs<policy>(std::forward<Args>(args)...);
@@ -84,40 +74,32 @@ struct PolicyStableSortPairs
 };
 
 using SequentialStableSortSorters =
-  camp::list<
-              PolicyStableSort<RAJA::seq_exec>,
-              PolicyStableSortPairs<RAJA::seq_exec>
-            >;
+    camp::list<PolicyStableSort<RAJA::seq_exec>,
+               PolicyStableSortPairs<RAJA::seq_exec>>;
 
 #if defined(RAJA_ENABLE_OPENMP)
 
 using OpenMPStableSortSorters =
-  camp::list<
-              PolicyStableSort<RAJA::omp_parallel_for_exec>,
-              PolicyStableSortPairs<RAJA::omp_parallel_for_exec>
-            >;
+    camp::list<PolicyStableSort<RAJA::omp_parallel_for_exec>,
+               PolicyStableSortPairs<RAJA::omp_parallel_for_exec>>;
 
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
 
 using CudaStableSortSorters =
-  camp::list<
-              PolicyStableSort<RAJA::cuda_exec<128>>,
-              PolicyStableSortPairs<RAJA::cuda_exec<128>>,
-              PolicyStableSort<RAJA::cuda_exec_explicit<128, 2>>
-            >;
+    camp::list<PolicyStableSort<RAJA::cuda_exec<128>>,
+               PolicyStableSortPairs<RAJA::cuda_exec<128>>,
+               PolicyStableSort<RAJA::cuda_exec_explicit<128, 2>>>;
 
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 
 using HipStableSortSorters =
-  camp::list<
-              PolicyStableSort<RAJA::hip_exec<128>>,
-              PolicyStableSortPairs<RAJA::hip_exec<128>>
-            >;
+    camp::list<PolicyStableSort<RAJA::hip_exec<128>>,
+               PolicyStableSortPairs<RAJA::hip_exec<128>>>;
 
 #endif
 
-#endif // __TEST_UNIT_ALGORITHM_STABLE_SORT_HPP__
+#endif  // __TEST_UNIT_ALGORITHM_STABLE_SORT_HPP__
diff --git a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp
index 062e0f9b91..52570dbdf1 100644
--- a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp
+++ b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp
@@ -23,49 +23,42 @@
 #include "test-algorithm-reduce-utils.hpp"
 
 
-template < typename test_policy >
-using ForoneSynchronize = PolicySynchronize<test_equivalent_exec_policy<test_policy>>;
+template <typename test_policy>
+using ForoneSynchronize =
+    PolicySynchronize<test_equivalent_exec_policy<test_policy>>;
 
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct BinaryTreeReduce;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct Accumulate;
 
 
-template < typename test_policy >
-struct BinaryTreeReduce<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct BinaryTreeReduce<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using reduce_category = unordered_reduce_tag;
+  using reduce_category  = unordered_reduce_tag;
   using reduce_interface = reduce_interface_tag;
 
-  const char* name()
-  {
-    return "RAJA::binary_tree_reduce";
-  }
+  const char* name() { return "RAJA::binary_tree_reduce"; }
 
-  template < typename T, typename... Args >
+  template <typename T, typename... Args>
   void operator()(T* reduced_value, Args&&... args)
   {
     *reduced_value = RAJA::binary_tree_reduce(std::forward<Args>(args)...);
   }
 };
 
-template < typename test_policy >
-struct Accumulate<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct Accumulate<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using reduce_category = left_fold_reduce_tag;
+  using reduce_category  = left_fold_reduce_tag;
   using reduce_interface = reduce_interface_tag;
 
-  const char* name()
-  {
-    return "RAJA::accumulate";
-  }
+  const char* name() { return "RAJA::accumulate"; }
 
-  template < typename T, typename... Args >
+  template <typename T, typename... Args>
   void operator()(T* reduced_value, Args&&... args)
   {
     *reduced_value = RAJA::accumulate(std::forward<Args>(args)...);
@@ -74,89 +67,90 @@ struct Accumulate<test_policy, RunOnHost>
 
 #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
 
-template < typename test_policy >
+template <typename test_policy>
 struct BinaryTreeReduce<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+    : ForoneSynchronize<test_policy>
 {
-  using reduce_category = unordered_reduce_tag;
+  using reduce_category  = unordered_reduce_tag;
   using reduce_interface = reduce_interface_tag;
 
   std::string m_name;
 
   BinaryTreeReduce()
-    : m_name(std::string("RAJA::binary_tree_reduce<") + test_policy_info<test_policy>::name() + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::binary_tree_reduce<") +
+               test_policy_info<test_policy>::name() + std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename T, typename Container >
+  template <typename T, typename Container>
   void operator()(T* reduced_value, Container&& c)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      *reduced_value = RAJA::binary_tree_reduce(c);
-    });
+    forone<test_policy>([=] RAJA_DEVICE()
+                        { *reduced_value = RAJA::binary_tree_reduce(c); });
   }
 
-  template < typename T, typename Container >
-  void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal<Container> init)
+  template <typename T, typename Container>
+  void operator()(T* reduced_value,
+                  Container&& c,
+                  RAJA::detail::ContainerVal<Container> init)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      *reduced_value = RAJA::binary_tree_reduce(c, init);
-    });
+    forone<test_policy>(
+        [=] RAJA_DEVICE()
+        { *reduced_value = RAJA::binary_tree_reduce(c, init); });
   }
 
-  template < typename T, typename Container, typename BinaryOp >
-  void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal<Container> init, BinaryOp op)
+  template <typename T, typename Container, typename BinaryOp>
+  void operator()(T* reduced_value,
+                  Container&& c,
+                  RAJA::detail::ContainerVal<Container> init,
+                  BinaryOp op)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      *reduced_value = RAJA::binary_tree_reduce(c, init, op);
-    });
+    forone<test_policy>(
+        [=] RAJA_DEVICE()
+        { *reduced_value = RAJA::binary_tree_reduce(c, init, op); });
   }
 };
 
-template < typename test_policy >
-struct Accumulate<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct Accumulate<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using reduce_category = left_fold_reduce_tag;
+  using reduce_category  = left_fold_reduce_tag;
   using reduce_interface = reduce_interface_tag;
 
   std::string m_name;
 
   Accumulate()
-    : m_name(std::string("RAJA::accumulate<") + test_policy_info<test_policy>::name() + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::accumulate<") +
+               test_policy_info<test_policy>::name() + std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename T, typename Container >
+  template <typename T, typename Container>
   void operator()(T* reduced_value, Container&& c)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      *reduced_value = RAJA::accumulate(c);
-    });
+    forone<test_policy>([=] RAJA_DEVICE()
+                        { *reduced_value = RAJA::accumulate(c); });
   }
 
-  template < typename T, typename Container >
-  void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal<Container> init)
+  template <typename T, typename Container>
+  void operator()(T* reduced_value,
+                  Container&& c,
+                  RAJA::detail::ContainerVal<Container> init)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      *reduced_value = RAJA::accumulate(c, init);
-    });
+    forone<test_policy>([=] RAJA_DEVICE()
+                        { *reduced_value = RAJA::accumulate(c, init); });
   }
 
-  template < typename T, typename Container, typename BinaryOp >
-  void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal<Container> init, BinaryOp op)
+  template <typename T, typename Container, typename BinaryOp>
+  void operator()(T* reduced_value,
+                  Container&& c,
+                  RAJA::detail::ContainerVal<Container> init,
+                  BinaryOp op)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      *reduced_value = RAJA::accumulate(c, init, op);
-    });
+    forone<test_policy>([=] RAJA_DEVICE()
+                        { *reduced_value = RAJA::accumulate(c, init, op); });
   }
 };
 
@@ -164,42 +158,24 @@ struct Accumulate<test_policy, RunOnDevice>
 
 
 using SequentialBinaryTreeReduceReducers =
-  camp::list<
-              BinaryTreeReduce<test_seq>
-            >;
+    camp::list<BinaryTreeReduce<test_seq>>;
 
-using SequentialAccumulateReduceReducers =
-  camp::list<
-              Accumulate<test_seq>
-            >;
+using SequentialAccumulateReduceReducers = camp::list<Accumulate<test_seq>>;
 
 #if defined(RAJA_ENABLE_CUDA)
 
-using CudaBinaryTreeReduceReducers =
-  camp::list<
-              BinaryTreeReduce<test_cuda>
-            >;
+using CudaBinaryTreeReduceReducers = camp::list<BinaryTreeReduce<test_cuda>>;
 
-using CudaAccumulateReduceReducers =
-  camp::list<
-              Accumulate<test_cuda>
-            >;
+using CudaAccumulateReduceReducers = camp::list<Accumulate<test_cuda>>;
 
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 
-using HipBinaryTreeReduceReducers =
-  camp::list<
-              BinaryTreeReduce<test_hip>
-            >;
+using HipBinaryTreeReduceReducers = camp::list<BinaryTreeReduce<test_hip>>;
 
-using HipAccumulateReduceReducers =
-  camp::list<
-              Accumulate<test_hip>
-            >;
+using HipAccumulateReduceReducers = camp::list<Accumulate<test_hip>>;
 
 #endif
 
-#endif //__TEST_ALGORITHM_UTIL_REDUCE_HPP__
-
+#endif  //__TEST_ALGORITHM_UTIL_REDUCE_HPP__
diff --git a/test/unit/algorithm/tests/test-algorithm-util-sort.hpp b/test/unit/algorithm/tests/test-algorithm-util-sort.hpp
index 032097d9e3..b972b752cd 100644
--- a/test/unit/algorithm/tests/test-algorithm-util-sort.hpp
+++ b/test/unit/algorithm/tests/test-algorithm-util-sort.hpp
@@ -23,261 +23,233 @@
 #include "test-algorithm-sort-utils.hpp"
 
 
-template < typename test_policy >
-using ForoneSynchronize = PolicySynchronize<test_equivalent_exec_policy<test_policy>>;
+template <typename test_policy>
+using ForoneSynchronize =
+    PolicySynchronize<test_equivalent_exec_policy<test_policy>>;
 
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct InsertionSort;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct InsertionSortPairs;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct ShellSort;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct ShellSortPairs;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct HeapSort;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct HeapSortPairs;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct IntroSort;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct IntroSortPairs;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct MergeSort;
 
-template < typename test_policy, typename platform = test_platform<test_policy> >
+template <typename test_policy, typename platform = test_platform<test_policy>>
 struct MergeSortPairs;
 
 
-template < typename test_policy >
-struct InsertionSort<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct InsertionSort<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = stable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = stable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::insertion_sort";
-  }
+  const char* name() { return "RAJA::insertion_sort"; }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::insertion_sort(std::forward<Args>(args)...);
   }
 };
 
-template < typename test_policy >
+template <typename test_policy>
 struct InsertionSortPairs<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+    : ForoneSynchronize<test_policy>
 {
-  using sort_category = stable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = stable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::insertion_sort[pairs]";
-  }
+  const char* name() { return "RAJA::insertion_sort[pairs]"; }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    auto c = RAJA::zip_span(keys, vals);
+    auto c        = RAJA::zip_span(keys, vals);
     using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
     RAJA::insertion_sort(c, RAJA::compare_first<zip_ref>(comp));
   }
 };
 
-template < typename test_policy >
-struct ShellSort<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct ShellSort<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::shell_sort";
-  }
+  const char* name() { return "RAJA::shell_sort"; }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::shell_sort(std::forward<Args>(args)...);
   }
 };
 
-template < typename test_policy >
-struct ShellSortPairs<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct ShellSortPairs<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::shell_sort[pairs]";
-  }
+  const char* name() { return "RAJA::shell_sort[pairs]"; }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    auto c = RAJA::zip_span(keys, vals);
+    auto c        = RAJA::zip_span(keys, vals);
     using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
     RAJA::shell_sort(c, RAJA::compare_first<zip_ref>(comp));
   }
 };
 
-template < typename test_policy >
-struct HeapSort<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct HeapSort<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::heap_sort";
-  }
+  const char* name() { return "RAJA::heap_sort"; }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::heap_sort(std::forward<Args>(args)...);
   }
 };
 
-template < typename test_policy >
-struct HeapSortPairs<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct HeapSortPairs<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::heap_sort[pairs]";
-  }
+  const char* name() { return "RAJA::heap_sort[pairs]"; }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    auto c = RAJA::zip_span(keys, vals);
+    auto c        = RAJA::zip_span(keys, vals);
     using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
     RAJA::heap_sort(c, RAJA::compare_first<zip_ref>(comp));
   }
 };
 
-template < typename test_policy >
-struct IntroSort<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct IntroSort<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::intro_sort";
-  }
+  const char* name() { return "RAJA::intro_sort"; }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::intro_sort(std::forward<Args>(args)...);
   }
 };
 
-template < typename test_policy >
-struct IntroSortPairs<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct IntroSortPairs<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::intro_sort[pairs]";
-  }
+  const char* name() { return "RAJA::intro_sort[pairs]"; }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    auto c = RAJA::zip_span(keys, vals);
+    auto c        = RAJA::zip_span(keys, vals);
     using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
     RAJA::intro_sort(c, RAJA::compare_first<zip_ref>(comp));
   }
 };
 
-template < typename test_policy >
-struct MergeSort<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct MergeSort<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = stable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = stable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::merge_sort";
-  }
+  const char* name() { return "RAJA::merge_sort"; }
 
-  template < typename... Args >
+  template <typename... Args>
   void operator()(Args&&... args)
   {
     RAJA::merge_sort(std::forward<Args>(args)...);
   }
 };
 
-template < typename test_policy >
-struct MergeSortPairs<test_policy, RunOnHost>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct MergeSortPairs<test_policy, RunOnHost> : ForoneSynchronize<test_policy>
 {
-  using sort_category = stable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = stable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
-  const char* name()
-  {
-    return "RAJA::merge_sort[pairs]";
-  }
+  const char* name() { return "RAJA::merge_sort[pairs]"; }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    auto c = RAJA::zip_span(keys, vals);
+    auto c        = RAJA::zip_span(keys, vals);
     using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
     RAJA::merge_sort(c, RAJA::compare_first<zip_ref>(comp));
   }
@@ -285,348 +257,319 @@ struct MergeSortPairs<test_policy, RunOnHost>
 
 #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
 
-template < typename test_policy >
-struct InsertionSort<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct InsertionSort<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = stable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = stable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   InsertionSort()
-    : m_name(std::string("RAJA::insertion_sort<") + test_policy_info<test_policy>::name() + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::insertion_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename Container >
+  template <typename Container>
   void operator()(Container&& c)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::insertion_sort(c);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::insertion_sort(c); });
   }
 
-  template < typename Container, typename Compare >
+  template <typename Container, typename Compare>
   void operator()(Container&& c, Compare comp)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::insertion_sort(c, comp);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::insertion_sort(c, comp); });
   }
 };
 
-template < typename test_policy >
+template <typename test_policy>
 struct InsertionSortPairs<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+    : ForoneSynchronize<test_policy>
 {
-  using sort_category = stable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = stable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   InsertionSortPairs()
-    : m_name(std::string("RAJA::insertion_sort<") + test_policy_info<test_policy>::name() + std::string(">[pairs]"))
-  { }
+      : m_name(std::string("RAJA::insertion_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">[pairs]"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      auto c = RAJA::zip_span(keys, vals);
-      using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
-      RAJA::insertion_sort(c, RAJA::compare_first<zip_ref>(comp));
-    });
+    forone<test_policy>(
+        [=] RAJA_DEVICE()
+        {
+          auto c        = RAJA::zip_span(keys, vals);
+          using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
+          RAJA::insertion_sort(c, RAJA::compare_first<zip_ref>(comp));
+        });
   }
 };
 
-template < typename test_policy >
-struct ShellSort<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct ShellSort<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   ShellSort()
-    : m_name(std::string("RAJA::shell_sort<") + test_policy_info<test_policy>::name() + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::shell_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename Container >
+  template <typename Container>
   void operator()(Container&& c)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::shell_sort(c);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::shell_sort(c); });
   }
 
-  template < typename Container, typename Compare >
+  template <typename Container, typename Compare>
   void operator()(Container&& c, Compare comp)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::shell_sort(c, comp);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::shell_sort(c, comp); });
   }
 };
 
-template < typename test_policy >
-struct ShellSortPairs<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct ShellSortPairs<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   ShellSortPairs()
-    : m_name(std::string("RAJA::shell_sort<") + test_policy_info<test_policy>::name() + std::string(">[pairs]"))
-  { }
+      : m_name(std::string("RAJA::shell_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">[pairs]"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      auto c = RAJA::zip_span(keys, vals);
-      using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
-      RAJA::shell_sort(c, RAJA::compare_first<zip_ref>(comp));
-    });
+    forone<test_policy>(
+        [=] RAJA_DEVICE()
+        {
+          auto c        = RAJA::zip_span(keys, vals);
+          using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
+          RAJA::shell_sort(c, RAJA::compare_first<zip_ref>(comp));
+        });
   }
 };
 
-template < typename test_policy >
-struct HeapSort<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct HeapSort<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   HeapSort()
-    : m_name(std::string("RAJA::heap_sort<") + test_policy_info<test_policy>::name() + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::heap_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename Container >
+  template <typename Container>
   void operator()(Container c)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::heap_sort(c);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::heap_sort(c); });
   }
 
-  template < typename Container, typename Compare >
+  template <typename Container, typename Compare>
   void operator()(Container c, Compare comp)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::heap_sort(c, comp);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::heap_sort(c, comp); });
   }
 };
 
-template < typename test_policy >
-struct HeapSortPairs<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct HeapSortPairs<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   HeapSortPairs()
-    : m_name(std::string("RAJA::heap_sort<") + test_policy_info<test_policy>::name() + std::string(">[pairs]"))
-  { }
+      : m_name(std::string("RAJA::heap_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">[pairs]"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      auto c = RAJA::zip_span(keys, vals);
-      using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
-      RAJA::heap_sort(c, RAJA::compare_first<zip_ref>(comp));
-    });
+    forone<test_policy>(
+        [=] RAJA_DEVICE()
+        {
+          auto c        = RAJA::zip_span(keys, vals);
+          using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
+          RAJA::heap_sort(c, RAJA::compare_first<zip_ref>(comp));
+        });
   }
 };
 
-template < typename test_policy >
-struct IntroSort<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct IntroSort<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   IntroSort()
-    : m_name(std::string("RAJA::intro_sort<") + test_policy_info<test_policy>::name() + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::intro_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename Container >
+  template <typename Container>
   void operator()(Container&& c)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::intro_sort(c);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::intro_sort(c); });
   }
 
-  template < typename Container, typename Compare >
+  template <typename Container, typename Compare>
   void operator()(Container&& c, Compare comp)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::intro_sort(c, comp);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::intro_sort(c, comp); });
   }
 };
 
-template < typename test_policy >
-struct IntroSortPairs<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct IntroSortPairs<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   IntroSortPairs()
-    : m_name(std::string("RAJA::intro_sort<") + test_policy_info<test_policy>::name() + std::string(">[pairs]"))
-  { }
+      : m_name(std::string("RAJA::intro_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">[pairs]"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      auto c = RAJA::zip_span(keys, vals);
-      using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
-      RAJA::intro_sort(c, RAJA::compare_first<zip_ref>(comp));
-    });
+    forone<test_policy>(
+        [=] RAJA_DEVICE()
+        {
+          auto c        = RAJA::zip_span(keys, vals);
+          using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
+          RAJA::intro_sort(c, RAJA::compare_first<zip_ref>(comp));
+        });
   }
 };
 
-template < typename test_policy >
-struct MergeSort<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct MergeSort<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   MergeSort()
-    : m_name(std::string("RAJA::merge_sort<") + test_policy_info<test_policy>::name() + std::string(">"))
-  { }
+      : m_name(std::string("RAJA::merge_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename Container >
+  template <typename Container>
   void operator()(Container&& c)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::merge_sort(c);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::merge_sort(c); });
   }
 
-  template < typename Container, typename Compare >
+  template <typename Container, typename Compare>
   void operator()(Container&& c, Compare comp)
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      RAJA::merge_sort(c, comp);
-    });
+    forone<test_policy>([=] RAJA_DEVICE() { RAJA::merge_sort(c, comp); });
   }
 };
 
-template < typename test_policy >
-struct MergeSortPairs<test_policy, RunOnDevice>
-  : ForoneSynchronize<test_policy>
+template <typename test_policy>
+struct MergeSortPairs<test_policy, RunOnDevice> : ForoneSynchronize<test_policy>
 {
-  using sort_category = unstable_sort_tag;
-  using sort_interface = sort_pairs_interface_tag;
+  using sort_category     = unstable_sort_tag;
+  using sort_interface    = sort_pairs_interface_tag;
   using supports_resource = std::false_type;
 
   std::string m_name;
 
   MergeSortPairs()
-    : m_name(std::string("RAJA::merge_sort<") + test_policy_info<test_policy>::name() + std::string(">[pairs]"))
-  { }
+      : m_name(std::string("RAJA::merge_sort<") +
+               test_policy_info<test_policy>::name() + std::string(">[pairs]"))
+  {}
 
-  const char* name()
-  {
-    return m_name.c_str();
-  }
+  const char* name() { return m_name.c_str(); }
 
-  template < typename KeyContainer, typename ValContainer,
-             typename Compare = RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
+  template <typename KeyContainer,
+            typename ValContainer,
+            typename Compare =
+                RAJA::operators::less<RAJA::detail::ContainerRef<KeyContainer>>>
   void operator()(KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
   {
-    forone<test_policy>( [=] RAJA_DEVICE() {
-      auto c = RAJA::zip_span(keys, vals);
-      using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
-      RAJA::merge_sort(c, RAJA::compare_first<zip_ref>(comp));
-    });
+    forone<test_policy>(
+        [=] RAJA_DEVICE()
+        {
+          auto c        = RAJA::zip_span(keys, vals);
+          using zip_ref = RAJA::detail::ContainerRef<camp::decay<decltype(c)>>;
+          RAJA::merge_sort(c, RAJA::compare_first<zip_ref>(comp));
+        });
   }
 };
 
@@ -634,102 +577,56 @@ struct MergeSortPairs<test_policy, RunOnDevice>
 
 
 using SequentialInsertionSortSorters =
-  camp::list<
-              InsertionSort<test_seq>,
-              InsertionSortPairs<test_seq>
-            >;
+    camp::list<InsertionSort<test_seq>, InsertionSortPairs<test_seq>>;
 
 using SequentialShellSortSorters =
-  camp::list<
-              ShellSort<test_seq>,
-              ShellSortPairs<test_seq>
-            >;
+    camp::list<ShellSort<test_seq>, ShellSortPairs<test_seq>>;
 
 using SequentialHeapSortSorters =
-  camp::list<
-              HeapSort<test_seq>,
-              HeapSortPairs<test_seq>
-            >;
+    camp::list<HeapSort<test_seq>, HeapSortPairs<test_seq>>;
 
 using SequentialIntroSortSorters =
-  camp::list<
-              IntroSort<test_seq>,
-              IntroSortPairs<test_seq>
-            >;
+    camp::list<IntroSort<test_seq>, IntroSortPairs<test_seq>>;
 
 using SequentialMergeSortSorters =
-  camp::list<
-              MergeSort<test_seq>,
-              MergeSortPairs<test_seq>
-            >;
+    camp::list<MergeSort<test_seq>, MergeSortPairs<test_seq>>;
 
 #if defined(RAJA_ENABLE_CUDA)
 
 using CudaInsertionSortSorters =
-  camp::list<
-              InsertionSort<test_cuda>,
-              InsertionSortPairs<test_cuda>
-            >;
+    camp::list<InsertionSort<test_cuda>, InsertionSortPairs<test_cuda>>;
 
 using CudaShellSortSorters =
-  camp::list<
-              ShellSort<test_cuda>,
-              ShellSortPairs<test_cuda>
-            >;
+    camp::list<ShellSort<test_cuda>, ShellSortPairs<test_cuda>>;
 
 using CudaHeapSortSorters =
-  camp::list<
-              HeapSort<test_cuda>,
-              HeapSortPairs<test_cuda>
-            >;
+    camp::list<HeapSort<test_cuda>, HeapSortPairs<test_cuda>>;
 
 using CudaIntroSortSorters =
-  camp::list<
-              IntroSort<test_cuda>,
-              IntroSortPairs<test_cuda>
-            >;
+    camp::list<IntroSort<test_cuda>, IntroSortPairs<test_cuda>>;
 
 using CudaMergeSortSorters =
-  camp::list<
-              MergeSort<test_cuda>,
-              MergeSortPairs<test_cuda>
-            >;
+    camp::list<MergeSort<test_cuda>, MergeSortPairs<test_cuda>>;
 
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 
 using HipInsertionSortSorters =
-  camp::list<
-              InsertionSort<test_hip>,
-              InsertionSortPairs<test_hip>
-            >;
+    camp::list<InsertionSort<test_hip>, InsertionSortPairs<test_hip>>;
 
 using HipShellSortSorters =
-  camp::list<
-              ShellSort<test_hip>,
-              ShellSortPairs<test_hip>
-            >;
+    camp::list<ShellSort<test_hip>, ShellSortPairs<test_hip>>;
 
 using HipHeapSortSorters =
-  camp::list<
-              HeapSort<test_hip>,
-              HeapSortPairs<test_hip>
-            >;
+    camp::list<HeapSort<test_hip>, HeapSortPairs<test_hip>>;
 
 using HipIntroSortSorters =
-  camp::list<
-              IntroSort<test_hip>,
-              IntroSortPairs<test_hip>
-            >;
+    camp::list<IntroSort<test_hip>, IntroSortPairs<test_hip>>;
 
 using HipMergeSortSorters =
-  camp::list<
-              MergeSort<test_hip>,
-              MergeSortPairs<test_hip>
-            >;
+    camp::list<MergeSort<test_hip>, MergeSortPairs<test_hip>>;
 
 #endif
 
-#endif //__TEST_ALGORITHM_UTIL_SORT_HPP__
-
+#endif  //__TEST_ALGORITHM_UTIL_SORT_HPP__
diff --git a/test/unit/atomic/test-atomic-incdec.cpp b/test/unit/atomic/test-atomic-incdec.cpp
index 6564feeaf5..8a48670cd4 100644
--- a/test/unit/atomic/test-atomic-incdec.cpp
+++ b/test/unit/atomic/test-atomic-incdec.cpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing tests for "wrapping" increment and decrement functions
+/// Source file containing tests for "wrapping" increment and decrement
+/// functions
 ///
 
 #include "RAJA/RAJA.hpp"
@@ -18,31 +19,30 @@
 #endif
 
 using unsigned_types =
-    ::testing::Types<
-                      std::tuple<unsigned int, RAJA::builtin_atomic>,
-                      std::tuple<unsigned int, RAJA::seq_atomic>,
-                      std::tuple<unsigned long long int, RAJA::builtin_atomic>,
-                      std::tuple<unsigned long long int, RAJA::seq_atomic>
+    ::testing::Types<std::tuple<unsigned int, RAJA::builtin_atomic>,
+                     std::tuple<unsigned int, RAJA::seq_atomic>,
+                     std::tuple<unsigned long long int, RAJA::builtin_atomic>,
+                     std::tuple<unsigned long long int, RAJA::seq_atomic>
 #if defined(RAJA_ENABLE_OPENMP)
-                      ,
-                      std::tuple<unsigned int, RAJA::omp_atomic>,
-                      std::tuple<unsigned long long int, RAJA::omp_atomic>
+                     ,
+                     std::tuple<unsigned int, RAJA::omp_atomic>,
+                     std::tuple<unsigned long long int, RAJA::omp_atomic>
 #endif
 #if defined(RAJA_ENABLE_CUDA)
-                      ,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::cuda_atomic>
+                     ,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::cuda_atomic>
 #endif
 #if defined(RAJA_ENABLE_HIP)
-                      ,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::hip_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::hip_atomic>
+                     ,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::hip_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::hip_atomic>
 #endif
-                    >;
+                     >;
 
 // Basic Inc Dec
 
@@ -50,143 +50,144 @@ template <typename T>
 class AtomicBasicIncDecUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicBasicIncDecUnitTest );
+TYPED_TEST_SUITE_P(AtomicBasicIncDecUnitTest);
 
-TYPED_TEST_P( AtomicBasicIncDecUnitTest, BasicIncDecs )
+TYPED_TEST_P(AtomicBasicIncDecUnitTest, BasicIncDecs)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
   // test "wrapping" increment
-  // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
+  // See:
+  // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
 
-  T inc_init = (T)0;
+  T inc_init    = (T)0;
   T* inc_result = &inc_init;
 
   // oldval < val, increment oldval
   RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1);
-  ASSERT_EQ( inc_result[0], (T)1 );
+  ASSERT_EQ(inc_result[0], (T)1);
 
   // oldval == val, wrap to 0
   RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1);
-  ASSERT_EQ( inc_result[0], (T)0 );
+  ASSERT_EQ(inc_result[0], (T)0);
 
   // oldval > val, wrap to 0
   inc_result[0] = (T)2;
   RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1);
-  ASSERT_EQ( inc_result[0], (T)0 );
+  ASSERT_EQ(inc_result[0], (T)0);
 
   // test "wrapping" decrement
-  // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
+  // See:
+  // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
 
-  T dec_init = (T)1;
+  T dec_init    = (T)1;
   T* dec_result = &dec_init;
 
   // oldval > 0, decrement oldval
   RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1);
-  ASSERT_EQ( dec_result[0], (T)0 );
+  ASSERT_EQ(dec_result[0], (T)0);
 
   // oldval == 0, wrap to val
   RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1);
-  ASSERT_EQ( dec_result[0], (T)1 );
+  ASSERT_EQ(dec_result[0], (T)1);
 
   // oldval > val, wrap to val
   dec_result[0] = (T)3;
   RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1);
-  ASSERT_EQ( dec_result[0], (T)1 );
+  ASSERT_EQ(dec_result[0], (T)1);
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicBasicIncDecUnitTest,
-                             BasicIncDecs
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicBasicIncDecUnitTest, BasicIncDecs);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( BasicIncDecUnitTest,
-                                AtomicBasicIncDecUnitTest,
-                                unsigned_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(BasicIncDecUnitTest,
+                               AtomicBasicIncDecUnitTest,
+                               unsigned_types);
 
 
 // Pure CUDA test.
 #if defined(RAJA_ENABLE_CUDA)
 
 using CUDA_unsigned_types =
-    ::testing::Types<
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::cuda_atomic>
-                    >;
+    ::testing::Types<std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::cuda_atomic>>;
 
 
 template <typename T>
 class AtomicCUDAIncDecUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicCUDAIncDecUnitTest );
+TYPED_TEST_SUITE_P(AtomicCUDAIncDecUnitTest);
 
-GPU_TYPED_TEST_P( AtomicCUDAIncDecUnitTest, CUDAIncDecs )
+GPU_TYPED_TEST_P(AtomicCUDAIncDecUnitTest, CUDAIncDecs)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T * inc_result = nullptr;
-  T * dec_result = nullptr;
-  cudaErrchk(cudaMallocManaged((void **)&inc_result, sizeof(T)));
-  cudaErrchk(cudaMallocManaged((void **)&dec_result, sizeof(T)));
+  T* inc_result = nullptr;
+  T* dec_result = nullptr;
+  cudaErrchk(cudaMallocManaged((void**)&inc_result, sizeof(T)));
+  cudaErrchk(cudaMallocManaged((void**)&dec_result, sizeof(T)));
   cudaErrchk(cudaDeviceSynchronize());
 
   // test "wrapping" increment
-  // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
+  // See:
+  // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
 
   inc_result[0] = (T)0;
   // oldval < val, increment oldval
-  forone<test_cuda>( [=] __device__ () {RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1);} );
+  forone<test_cuda>([=] __device__()
+                    { RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( inc_result[0], (T)1 );
+  ASSERT_EQ(inc_result[0], (T)1);
 
   // oldval == val, wrap to 0
-  forone<test_cuda>( [=] __device__ () {RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1);} );
+  forone<test_cuda>([=] __device__()
+                    { RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( inc_result[0], (T)0 );
+  ASSERT_EQ(inc_result[0], (T)0);
 
   // oldval > val, wrap to 0
   inc_result[0] = (T)2;
-  forone<test_cuda>( [=] __device__ () {RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1);} );
+  forone<test_cuda>([=] __device__()
+                    { RAJA::atomicInc<AtomicPolicy>(inc_result, (T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( inc_result[0], (T)0 );
+  ASSERT_EQ(inc_result[0], (T)0);
 
   // test "wrapping" decrement
-  // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
+  // See:
+  // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
 
   dec_result[0] = (T)1;
   // oldval > 0, decrement oldval
-  forone<test_cuda>( [=] __device__ () {RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1);} );
+  forone<test_cuda>([=] __device__()
+                    { RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( dec_result[0], (T)0 );
+  ASSERT_EQ(dec_result[0], (T)0);
 
   // oldval == 0, wrap to val
-  forone<test_cuda>( [=] __device__ () {RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1);} );
+  forone<test_cuda>([=] __device__()
+                    { RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( dec_result[0], (T)1 );
+  ASSERT_EQ(dec_result[0], (T)1);
 
   // oldval > val, wrap to val
   dec_result[0] = (T)3;
-  forone<test_cuda>( [=] __device__ () {RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1);} );
+  forone<test_cuda>([=] __device__()
+                    { RAJA::atomicDec<AtomicPolicy>(dec_result, (T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( dec_result[0], (T)1 );
+  ASSERT_EQ(dec_result[0], (T)1);
 
   cudaErrchk(cudaDeviceSynchronize());
   cudaErrchk(cudaFree(inc_result));
   cudaErrchk(cudaFree(dec_result));
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicCUDAIncDecUnitTest,
-                             CUDAIncDecs
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicCUDAIncDecUnitTest, CUDAIncDecs);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( CUDAIncDecUnitTest,
-                                AtomicCUDAIncDecUnitTest,
-                                CUDA_unsigned_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(CUDAIncDecUnitTest,
+                               AtomicCUDAIncDecUnitTest,
+                               CUDA_unsigned_types);
 #endif
-
diff --git a/test/unit/atomic/test-atomic-ref-accessors.cpp b/test/unit/atomic/test-atomic-ref-accessors.cpp
index f69813fcbe..677417d98d 100644
--- a/test/unit/atomic/test-atomic-ref-accessors.cpp
+++ b/test/unit/atomic/test-atomic-ref-accessors.cpp
@@ -25,47 +25,44 @@ template <typename T>
 class AtomicRefBasicAccessorUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefBasicAccessorUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefBasicAccessorUnitTest);
 
-TYPED_TEST_P( AtomicRefBasicAccessorUnitTest, BasicAccessors )
+TYPED_TEST_P(AtomicRefBasicAccessorUnitTest, BasicAccessors)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
   // should also work with CUDA
-  T theval = (T)0;
-  T * memaddr = &theval;
+  T theval   = (T)0;
+  T* memaddr = &theval;
   T result;
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test store method with op()
-  test1.store( (T)19 );
-  ASSERT_EQ( test1, (T)19 );
+  test1.store((T)19);
+  ASSERT_EQ(test1, (T)19);
 
   // test assignment operator
   test1 = (T)23;
-  ASSERT_EQ( test1, (T)23 );
+  ASSERT_EQ(test1, (T)23);
 
   // test load method
   test1 = (T)29;
-  ASSERT_EQ( test1.load(), (T)29 );
+  ASSERT_EQ(test1.load(), (T)29);
 
   // test ()
   result = (test1 = (T)31);
-  ASSERT_EQ( test1, (T)31 );
-  ASSERT_EQ( result, (T)31 );
+  ASSERT_EQ(test1, (T)31);
+  ASSERT_EQ(result, (T)31);
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefBasicAccessorUnitTest,
-                             BasicAccessors
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefBasicAccessorUnitTest, BasicAccessors);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( BasicAccessUnitTest,
-                                AtomicRefBasicAccessorUnitTest,
-                                basic_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(BasicAccessUnitTest,
+                               AtomicRefBasicAccessorUnitTest,
+                               basic_types);
 
 // Pure CUDA test.
 #if defined(RAJA_ENABLE_CUDA)
@@ -75,48 +72,58 @@ template <typename T>
 class AtomicRefCUDAAccessorUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefCUDAAccessorUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefCUDAAccessorUnitTest);
 
-GPU_TYPED_TEST_P( AtomicRefCUDAAccessorUnitTest, CUDAAccessors )
+GPU_TYPED_TEST_P(AtomicRefCUDAAccessorUnitTest, CUDAAccessors)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T * memaddr = nullptr;
-  T * result = nullptr;
-  cudaErrchk(cudaMallocManaged((void **)&memaddr, sizeof(T)));
-  cudaErrchk(cudaMallocManaged((void **)&result, sizeof(T)));
+  T* memaddr = nullptr;
+  T* result  = nullptr;
+  cudaErrchk(cudaMallocManaged((void**)&memaddr, sizeof(T)));
+  cudaErrchk(cudaMallocManaged((void**)&result, sizeof(T)));
   cudaErrchk(cudaDeviceSynchronize());
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test store method with op()
-  forone<test_cuda>( [=] __device__ () {test1.store( (T)19 );} );
+  forone<test_cuda>([=] __device__() { test1.store((T)19); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)19 );
+  ASSERT_EQ(test1, (T)19);
 
   // test assignment operator
-  forone<test_cuda>( [=] __device__ () {test1 = (T)23;} );
+  forone<test_cuda>([=] __device__() { test1 = (T)23; });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)23 );
+  ASSERT_EQ(test1, (T)23);
 
   // test load method
-  forone<test_cuda>( [=] __device__ () {test1 = (T)29; result[0] = test1.load();} );
+  forone<test_cuda>(
+      [=] __device__()
+      {
+        test1     = (T)29;
+        result[0] = test1.load();
+      });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)29 );
-  ASSERT_EQ( test1, (T)29 );
+  ASSERT_EQ(result[0], (T)29);
+  ASSERT_EQ(test1, (T)29);
 
   // test T()
-  forone<test_cuda>( [=] __device__ () {test1 = (T)47; result[0] = test1;} );
+  forone<test_cuda>(
+      [=] __device__()
+      {
+        test1     = (T)47;
+        result[0] = test1;
+      });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)47 );
-  ASSERT_EQ( test1, (T)47 );
+  ASSERT_EQ(result[0], (T)47);
+  ASSERT_EQ(test1, (T)47);
 
-  forone<test_cuda>( [=] __device__ () {result[0] = (test1 = (T)31);} );
+  forone<test_cuda>([=] __device__() { result[0] = (test1 = (T)31); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)31 );
-  ASSERT_EQ( test1, (T)31 );
+  ASSERT_EQ(result[0], (T)31);
+  ASSERT_EQ(test1, (T)31);
 
   cudaErrchk(cudaDeviceSynchronize());
 
@@ -124,14 +131,9 @@ GPU_TYPED_TEST_P( AtomicRefCUDAAccessorUnitTest, CUDAAccessors )
   cudaErrchk(cudaFree(result));
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefCUDAAccessorUnitTest,
-                             CUDAAccessors
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefCUDAAccessorUnitTest, CUDAAccessors);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( CUDAAccessUnitTest,
-                                AtomicRefCUDAAccessorUnitTest,
-                                CUDA_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(CUDAAccessUnitTest,
+                               AtomicRefCUDAAccessorUnitTest,
+                               CUDA_types);
 #endif
-
-
diff --git a/test/unit/atomic/test-atomic-ref-addsub.cpp b/test/unit/atomic/test-atomic-ref-addsub.cpp
index fba54f77fa..7899b2af5e 100644
--- a/test/unit/atomic/test-atomic-ref-addsub.cpp
+++ b/test/unit/atomic/test-atomic-ref-addsub.cpp
@@ -25,58 +25,55 @@ template <typename T>
 class AtomicRefBasicAddSubUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefBasicAddSubUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefBasicAddSubUnitTest);
 
-TYPED_TEST_P( AtomicRefBasicAddSubUnitTest, BasicAddSubs )
+TYPED_TEST_P(AtomicRefBasicAddSubUnitTest, BasicAddSubs)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T theval = (T)0;
-  T * memaddr = &theval;
+  T theval   = (T)0;
+  T* memaddr = &theval;
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test inc ops
   T val2 = ++test1;
   T val3 = test1++;
-  ASSERT_EQ( test1, (T)2 );
-  ASSERT_EQ( val2, (T)1 );
-  ASSERT_EQ( val3, (T)1 );
+  ASSERT_EQ(test1, (T)2);
+  ASSERT_EQ(val2, (T)1);
+  ASSERT_EQ(val3, (T)1);
 
   // test dec ops
   T val4 = --test1;
   T val5 = test1--;
-  ASSERT_EQ( test1, (T)0 );
-  ASSERT_EQ( val4, (T)1 );
-  ASSERT_EQ( val5, (T)1 );
+  ASSERT_EQ(test1, (T)0);
+  ASSERT_EQ(val4, (T)1);
+  ASSERT_EQ(val5, (T)1);
 
   // test add/sub ops
   T val6 = (test1 += (T)23);
-  ASSERT_EQ( test1, (T)23 );
-  ASSERT_EQ( val6, (T)23 );
+  ASSERT_EQ(test1, (T)23);
+  ASSERT_EQ(val6, (T)23);
   T val7 = (test1 -= (T)22);
-  ASSERT_EQ( test1, (T)1 );
-  ASSERT_EQ( val7, (T)1 );
+  ASSERT_EQ(test1, (T)1);
+  ASSERT_EQ(val7, (T)1);
 
   // test add/sub methods
-  T val8 = test1.fetch_add( (T)23 );
-  ASSERT_EQ( test1, (T)24 );
-  ASSERT_EQ( val8, (T)1 );
-  T val9 = test1.fetch_sub( (T)23 );
-  ASSERT_EQ( test1, (T)1 );
-  ASSERT_EQ( val9, (T)24 );
+  T val8 = test1.fetch_add((T)23);
+  ASSERT_EQ(test1, (T)24);
+  ASSERT_EQ(val8, (T)1);
+  T val9 = test1.fetch_sub((T)23);
+  ASSERT_EQ(test1, (T)1);
+  ASSERT_EQ(val9, (T)24);
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefBasicAddSubUnitTest,
-                             BasicAddSubs
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefBasicAddSubUnitTest, BasicAddSubs);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( BasicAddSubUnitTest,
-                                AtomicRefBasicAddSubUnitTest,
-                                basic_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(BasicAddSubUnitTest,
+                               AtomicRefBasicAddSubUnitTest,
+                               basic_types);
 
 
 // Pure CUDA test.
@@ -87,62 +84,62 @@ template <typename T>
 class AtomicRefCUDAAddSubUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefCUDAAddSubUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefCUDAAddSubUnitTest);
 
-GPU_TYPED_TEST_P( AtomicRefCUDAAddSubUnitTest, CUDAAddSubs )
+GPU_TYPED_TEST_P(AtomicRefCUDAAddSubUnitTest, CUDAAddSubs)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T * memaddr = nullptr;
-  T * result1 = nullptr;
-  T * result2 = nullptr;
-  cudaErrchk(cudaMallocManaged((void **)&memaddr, sizeof(T)));
-  cudaErrchk(cudaMallocManaged((void **)&result1, sizeof(T)));
-  cudaErrchk(cudaMallocManaged((void **)&result2, sizeof(T)));
+  T* memaddr = nullptr;
+  T* result1 = nullptr;
+  T* result2 = nullptr;
+  cudaErrchk(cudaMallocManaged((void**)&memaddr, sizeof(T)));
+  cudaErrchk(cudaMallocManaged((void**)&result1, sizeof(T)));
+  cudaErrchk(cudaMallocManaged((void**)&result2, sizeof(T)));
   memaddr[0] = (T)0;
   cudaErrchk(cudaDeviceSynchronize());
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test inc ops
-  forone<test_cuda>( [=] __device__ () {result1[0] = ++test1;} );
+  forone<test_cuda>([=] __device__() { result1[0] = ++test1; });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result1[0], (T)1 );
-  forone<test_cuda>( [=] __device__ () {result2[0] = test1++;} );
+  ASSERT_EQ(result1[0], (T)1);
+  forone<test_cuda>([=] __device__() { result2[0] = test1++; });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)2 );
-  ASSERT_EQ( result2[0], (T)1 );
+  ASSERT_EQ(test1, (T)2);
+  ASSERT_EQ(result2[0], (T)1);
 
   // test dec ops
-  forone<test_cuda>( [=] __device__ () {result1[0] = --test1;} );
+  forone<test_cuda>([=] __device__() { result1[0] = --test1; });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result1[0], (T)1 );
-  forone<test_cuda>( [=] __device__ () {result2[0] = test1--;} );
+  ASSERT_EQ(result1[0], (T)1);
+  forone<test_cuda>([=] __device__() { result2[0] = test1--; });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)0 );
-  ASSERT_EQ( result2[0], (T)1 );
+  ASSERT_EQ(test1, (T)0);
+  ASSERT_EQ(result2[0], (T)1);
 
   // test add/sub ops
-  forone<test_cuda>( [=] __device__ () {result1[0] = (test1 += (T)23);} );
+  forone<test_cuda>([=] __device__() { result1[0] = (test1 += (T)23); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)23 );
-  ASSERT_EQ( result1[0], (T)23 );
-  forone<test_cuda>( [=] __device__ () {result2[0] = (test1 -= (T)22);} );
+  ASSERT_EQ(test1, (T)23);
+  ASSERT_EQ(result1[0], (T)23);
+  forone<test_cuda>([=] __device__() { result2[0] = (test1 -= (T)22); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)1 );
-  ASSERT_EQ( result2[0], (T)1 );
+  ASSERT_EQ(test1, (T)1);
+  ASSERT_EQ(result2[0], (T)1);
 
   // test add/sub methods
-  forone<test_cuda>( [=] __device__ () {result1[0] = test1.fetch_add( (T)23 );} );
+  forone<test_cuda>([=] __device__() { result1[0] = test1.fetch_add((T)23); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)24 );
-  ASSERT_EQ( result1[0], (T)1 );
-  forone<test_cuda>( [=] __device__ () {result2[0] = test1.fetch_sub( (T)23 );} );
+  ASSERT_EQ(test1, (T)24);
+  ASSERT_EQ(result1[0], (T)1);
+  forone<test_cuda>([=] __device__() { result2[0] = test1.fetch_sub((T)23); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)1 );
-  ASSERT_EQ( result2[0], (T)24 );
+  ASSERT_EQ(test1, (T)1);
+  ASSERT_EQ(result2[0], (T)24);
 
   cudaErrchk(cudaDeviceSynchronize());
   cudaErrchk(cudaFree(memaddr));
@@ -150,13 +147,9 @@ GPU_TYPED_TEST_P( AtomicRefCUDAAddSubUnitTest, CUDAAddSubs )
   cudaErrchk(cudaFree(result2));
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefCUDAAddSubUnitTest,
-                             CUDAAddSubs
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefCUDAAddSubUnitTest, CUDAAddSubs);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( CUDAAddSubUnitTest,
-                                AtomicRefCUDAAddSubUnitTest,
-                                CUDA_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(CUDAAddSubUnitTest,
+                               AtomicRefCUDAAddSubUnitTest,
+                               CUDA_types);
 #endif
-
diff --git a/test/unit/atomic/test-atomic-ref-bitwise.cpp b/test/unit/atomic/test-atomic-ref-bitwise.cpp
index adf49599ca..4c9d840641 100644
--- a/test/unit/atomic/test-atomic-ref-bitwise.cpp
+++ b/test/unit/atomic/test-atomic-ref-bitwise.cpp
@@ -23,80 +23,76 @@ template <typename T>
 class AtomicRefBasicBitwiseUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefBasicBitwiseUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefBasicBitwiseUnitTest);
 
-TYPED_TEST_P( AtomicRefBasicBitwiseUnitTest, BasicBitwises )
+TYPED_TEST_P(AtomicRefBasicBitwiseUnitTest, BasicBitwises)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T theval = (T)1;
-  T * memaddr = &theval;
+  T theval   = (T)1;
+  T* memaddr = &theval;
   T result;
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test and/or
-  result = test1.fetch_and( (T)0 );
-  ASSERT_EQ( result, (T)1 );
-  ASSERT_EQ( test1, (T)0 );
+  result = test1.fetch_and((T)0);
+  ASSERT_EQ(result, (T)1);
+  ASSERT_EQ(test1, (T)0);
 
-  result = test1.fetch_or( (T)1 );
-  ASSERT_EQ( result, (T)0 );
-  ASSERT_EQ( test1, (T)1 );
+  result = test1.fetch_or((T)1);
+  ASSERT_EQ(result, (T)0);
+  ASSERT_EQ(test1, (T)1);
 
   result = (test1 &= (T)0);
-  ASSERT_EQ( test1, (T)0 );
-  ASSERT_EQ( result, (T)0 );
+  ASSERT_EQ(test1, (T)0);
+  ASSERT_EQ(result, (T)0);
 
   result = (test1 |= (T)1);
-  ASSERT_EQ( test1, (T)1 );
-  ASSERT_EQ( result, (T)1 );
+  ASSERT_EQ(test1, (T)1);
+  ASSERT_EQ(result, (T)1);
 
   // test xor
-  result = test1.fetch_xor( (T)1 );
-  ASSERT_EQ( result, (T)1 );
-  ASSERT_EQ( test1, (T)0 );
+  result = test1.fetch_xor((T)1);
+  ASSERT_EQ(result, (T)1);
+  ASSERT_EQ(test1, (T)0);
 
   result = (test1 ^= (T)1);
-  ASSERT_EQ( test1, (T)1 );
-  ASSERT_EQ( result, (T)1 );
+  ASSERT_EQ(test1, (T)1);
+  ASSERT_EQ(result, (T)1);
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefBasicBitwiseUnitTest,
-                             BasicBitwises
-                           );
-
-using basic_types = 
-    ::testing::Types<
-                      std::tuple<int, RAJA::builtin_atomic>,
-                      std::tuple<int, RAJA::seq_atomic>,
-                      std::tuple<unsigned int, RAJA::builtin_atomic>,
-                      std::tuple<unsigned int, RAJA::seq_atomic>,
-                      std::tuple<unsigned long long int, RAJA::builtin_atomic>,
-                      std::tuple<unsigned long long int, RAJA::seq_atomic>
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefBasicBitwiseUnitTest, BasicBitwises);
+
+using basic_types =
+    ::testing::Types<std::tuple<int, RAJA::builtin_atomic>,
+                     std::tuple<int, RAJA::seq_atomic>,
+                     std::tuple<unsigned int, RAJA::builtin_atomic>,
+                     std::tuple<unsigned int, RAJA::seq_atomic>,
+                     std::tuple<unsigned long long int, RAJA::builtin_atomic>,
+                     std::tuple<unsigned long long int, RAJA::seq_atomic>
 #if defined(RAJA_ENABLE_OPENMP)
-                      ,
-                      std::tuple<int, RAJA::omp_atomic>,
-                      std::tuple<unsigned int, RAJA::omp_atomic>,
-                      std::tuple<unsigned long long int, RAJA::omp_atomic>
+                     ,
+                     std::tuple<int, RAJA::omp_atomic>,
+                     std::tuple<unsigned int, RAJA::omp_atomic>,
+                     std::tuple<unsigned long long int, RAJA::omp_atomic>
 #endif
 #if defined(RAJA_ENABLE_CUDA)
-                      ,
-                      std::tuple<int, RAJA::auto_atomic>,
-                      std::tuple<int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::cuda_atomic>
+                     ,
+                     std::tuple<int, RAJA::auto_atomic>,
+                     std::tuple<int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::cuda_atomic>
 #endif
-                    >;
+                     >;
 
-INSTANTIATE_TYPED_TEST_SUITE_P( BasicBitwiseUnitTest,
-                                AtomicRefBasicBitwiseUnitTest,
-                                basic_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(BasicBitwiseUnitTest,
+                               AtomicRefBasicBitwiseUnitTest,
+                               basic_types);
 
 
 // Pure CUDA test.
@@ -107,77 +103,71 @@ template <typename T>
 class AtomicRefCUDABitwiseUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefCUDABitwiseUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefCUDABitwiseUnitTest);
 
-GPU_TYPED_TEST_P( AtomicRefCUDABitwiseUnitTest, CUDABitwises )
+GPU_TYPED_TEST_P(AtomicRefCUDABitwiseUnitTest, CUDABitwises)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T * memaddr = nullptr;
-  T * result = nullptr;
-  cudaErrchk(cudaMallocManaged((void **)&memaddr, sizeof(T)));
-  cudaErrchk(cudaMallocManaged((void **)&result, sizeof(T)));
+  T* memaddr = nullptr;
+  T* result  = nullptr;
+  cudaErrchk(cudaMallocManaged((void**)&memaddr, sizeof(T)));
+  cudaErrchk(cudaMallocManaged((void**)&result, sizeof(T)));
   memaddr[0] = (T)1;
   cudaErrchk(cudaDeviceSynchronize());
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test and/or
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.fetch_and( (T)0 );} );
+  forone<test_cuda>([=] __device__() { result[0] = test1.fetch_and((T)0); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)1 );
-  ASSERT_EQ( test1, (T)0 );
+  ASSERT_EQ(result[0], (T)1);
+  ASSERT_EQ(test1, (T)0);
 
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.fetch_or( (T)1 );} );
+  forone<test_cuda>([=] __device__() { result[0] = test1.fetch_or((T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)0 );
-  ASSERT_EQ( test1, (T)1 );
+  ASSERT_EQ(result[0], (T)0);
+  ASSERT_EQ(test1, (T)1);
 
-  forone<test_cuda>( [=] __device__ () {result[0] = (test1 &= (T)0);} );
+  forone<test_cuda>([=] __device__() { result[0] = (test1 &= (T)0); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)0 );
-  ASSERT_EQ( result[0], (T)0 );
+  ASSERT_EQ(test1, (T)0);
+  ASSERT_EQ(result[0], (T)0);
 
-  forone<test_cuda>( [=] __device__ () {result[0] = (test1 |= (T)1);} );
+  forone<test_cuda>([=] __device__() { result[0] = (test1 |= (T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)1 );
-  ASSERT_EQ( result[0], (T)1 );
+  ASSERT_EQ(test1, (T)1);
+  ASSERT_EQ(result[0], (T)1);
 
   // test xor
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.fetch_xor( (T)1 );} );
+  forone<test_cuda>([=] __device__() { result[0] = test1.fetch_xor((T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)1 );
-  ASSERT_EQ( test1, (T)0 );
+  ASSERT_EQ(result[0], (T)1);
+  ASSERT_EQ(test1, (T)0);
 
-  forone<test_cuda>( [=] __device__ () {result[0] = (test1 ^= (T)1);} );
+  forone<test_cuda>([=] __device__() { result[0] = (test1 ^= (T)1); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)1 );
-  ASSERT_EQ( result[0], (T)1 );
+  ASSERT_EQ(test1, (T)1);
+  ASSERT_EQ(result[0], (T)1);
 
   cudaErrchk(cudaDeviceSynchronize());
   cudaErrchk(cudaFree(memaddr));
   cudaErrchk(cudaFree(result));
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefCUDABitwiseUnitTest,
-                             CUDABitwises
-                           );
-
-using CUDA_types = 
-    ::testing::Types<
-                      std::tuple<int, RAJA::auto_atomic>,
-                      std::tuple<int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::cuda_atomic>
-                    >;
-
-INSTANTIATE_TYPED_TEST_SUITE_P( CUDABitwiseUnitTest,
-                                AtomicRefCUDABitwiseUnitTest,
-                                CUDA_types
-                              );
-#endif
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefCUDABitwiseUnitTest, CUDABitwises);
 
+using CUDA_types =
+    ::testing::Types<std::tuple<int, RAJA::auto_atomic>,
+                     std::tuple<int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::cuda_atomic>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(CUDABitwiseUnitTest,
+                               AtomicRefCUDABitwiseUnitTest,
+                               CUDA_types);
+#endif
diff --git a/test/unit/atomic/test-atomic-ref-constructor.cpp b/test/unit/atomic/test-atomic-ref-constructor.cpp
index 619e3ebf20..c86c6be8fb 100644
--- a/test/unit/atomic/test-atomic-ref-constructor.cpp
+++ b/test/unit/atomic/test-atomic-ref-constructor.cpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing tests for atomic ref constructors (and use of getPointer for verification)
+/// Source file containing tests for atomic ref constructors (and use of
+/// getPointer for verification)
 ///
 
 #include "RAJA/RAJA.hpp"
@@ -30,38 +31,33 @@ TYPED_TEST_SUITE_P(AtomicRefDefaultConstructorUnitTest);
 template <typename T>
 void DefaultPolConstructors()
 {
-  T * memaddr = nullptr;
+  T* memaddr = nullptr;
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T> test1( memaddr );
+  RAJA::AtomicRef<T> test1(memaddr);
 
-  ASSERT_EQ( test1.getPointer(), nullptr );
+  ASSERT_EQ(test1.getPointer(), nullptr);
 
   // ref constructor
-  RAJA::AtomicRef<T> const & reft1 = test1;
-  RAJA::AtomicRef<T> reftest1( reft1 );
+  RAJA::AtomicRef<T> const& reft1 = test1;
+  RAJA::AtomicRef<T> reftest1(reft1);
 
-  ASSERT_EQ( reftest1.getPointer(), nullptr );
+  ASSERT_EQ(reftest1.getPointer(), nullptr);
 }
 
-TYPED_TEST_P( AtomicRefDefaultConstructorUnitTest, DefaultPolConstructors )
+TYPED_TEST_P(AtomicRefDefaultConstructorUnitTest, DefaultPolConstructors)
 {
   DefaultPolConstructors<TypeParam>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefDefaultConstructorUnitTest,
-                             DefaultPolConstructors
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefDefaultConstructorUnitTest,
+                            DefaultPolConstructors);
 
-using default_types = ::testing::Types< int,
-                                      float,
-                                      double
-                                    >;
+using default_types = ::testing::Types<int, float, double>;
 
-INSTANTIATE_TYPED_TEST_SUITE_P( DefaultConstrUnitTest,
-                                AtomicRefDefaultConstructorUnitTest,
-                                default_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(DefaultConstrUnitTest,
+                               AtomicRefDefaultConstructorUnitTest,
+                               default_types);
 
 // Basic Constructors with policies
 
@@ -69,35 +65,33 @@ template <typename T>
 class AtomicRefBasicConstructorUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefBasicConstructorUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefBasicConstructorUnitTest);
 
-TYPED_TEST_P( AtomicRefBasicConstructorUnitTest, BasicConstructors )
+TYPED_TEST_P(AtomicRefBasicConstructorUnitTest, BasicConstructors)
 {
-  using NumericType = typename std::tuple_element<0, TypeParam>::type;
+  using NumericType  = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  NumericType * memaddr = nullptr;
+  NumericType* memaddr = nullptr;
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<NumericType, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<NumericType, AtomicPolicy> test1(memaddr);
 
-  ASSERT_EQ( test1.getPointer(), nullptr );
+  ASSERT_EQ(test1.getPointer(), nullptr);
 
   // ref constructor
-  RAJA::AtomicRef<NumericType, AtomicPolicy> const & reft1 = test1;
-  RAJA::AtomicRef<NumericType, AtomicPolicy> reftest1( reft1 );
+  RAJA::AtomicRef<NumericType, AtomicPolicy> const& reft1 = test1;
+  RAJA::AtomicRef<NumericType, AtomicPolicy> reftest1(reft1);
 
-  ASSERT_EQ( reftest1.getPointer(), nullptr );
+  ASSERT_EQ(reftest1.getPointer(), nullptr);
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefBasicConstructorUnitTest,
-                             BasicConstructors
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefBasicConstructorUnitTest,
+                            BasicConstructors);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( BasicConstrUnitTest,
-                                AtomicRefBasicConstructorUnitTest,
-                                basic_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(BasicConstrUnitTest,
+                               AtomicRefBasicConstructorUnitTest,
+                               basic_types);
 
 // Pure CUDA test.
 #if defined(RAJA_ENABLE_CUDA)
@@ -109,44 +103,40 @@ class AtomicRefCUDAConstructorUnitTest : public ::testing::Test
 
 TYPED_TEST_SUITE_P(AtomicRefCUDAConstructorUnitTest);
 
-GPU_TYPED_TEST_P( AtomicRefCUDAConstructorUnitTest, CUDAConstructors )
+GPU_TYPED_TEST_P(AtomicRefCUDAConstructorUnitTest, CUDAConstructors)
 {
-  using NumericType = typename std::tuple_element<0, TypeParam>::type;
+  using NumericType  = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  NumericType * memaddr = nullptr;
-  NumericType * proxy = nullptr;
-  cudaErrchk(cudaMallocManaged((void **)&proxy, sizeof(NumericType)));
+  NumericType* memaddr = nullptr;
+  NumericType* proxy   = nullptr;
+  cudaErrchk(cudaMallocManaged((void**)&proxy, sizeof(NumericType)));
   proxy = memaddr;
   cudaErrchk(cudaDeviceSynchronize());
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<NumericType, AtomicPolicy> test0( memaddr );
-  RAJA::AtomicRef<NumericType, AtomicPolicy> test1( proxy );
+  RAJA::AtomicRef<NumericType, AtomicPolicy> test0(memaddr);
+  RAJA::AtomicRef<NumericType, AtomicPolicy> test1(proxy);
 
-  forone<test_cuda>( [=] __device__ () {test1.getPointer();} );
+  forone<test_cuda>([=] __device__() { test1.getPointer(); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test0.getPointer(), nullptr );
-  ASSERT_EQ( test1.getPointer(), nullptr );
+  ASSERT_EQ(test0.getPointer(), nullptr);
+  ASSERT_EQ(test1.getPointer(), nullptr);
 
   // ref constructor
-  RAJA::AtomicRef<NumericType, AtomicPolicy> const & reft1 = test1;
-  RAJA::AtomicRef<NumericType, AtomicPolicy> reftest1( reft1 );
-  forone<test_cuda>( [=] __device__ () {reftest1.getPointer();} );
+  RAJA::AtomicRef<NumericType, AtomicPolicy> const& reft1 = test1;
+  RAJA::AtomicRef<NumericType, AtomicPolicy> reftest1(reft1);
+  forone<test_cuda>([=] __device__() { reftest1.getPointer(); });
   cudaErrchk(cudaDeviceSynchronize());
 
-  ASSERT_EQ( reftest1.getPointer(), nullptr );
+  ASSERT_EQ(reftest1.getPointer(), nullptr);
 
   cudaErrchk(cudaFree(proxy));
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefCUDAConstructorUnitTest,
-                             CUDAConstructors
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefCUDAConstructorUnitTest, CUDAConstructors);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( CUDAConstrUnitTest,
-                                AtomicRefCUDAConstructorUnitTest,
-                                CUDA_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(CUDAConstrUnitTest,
+                               AtomicRefCUDAConstructorUnitTest,
+                               CUDA_types);
 #endif
-
diff --git a/test/unit/atomic/test-atomic-ref-exchanges.cpp b/test/unit/atomic/test-atomic-ref-exchanges.cpp
index 18fa1e4819..842e18d319 100644
--- a/test/unit/atomic/test-atomic-ref-exchanges.cpp
+++ b/test/unit/atomic/test-atomic-ref-exchanges.cpp
@@ -23,91 +23,87 @@ template <typename T>
 class AtomicRefBasicExchangeUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefBasicExchangeUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefBasicExchangeUnitTest);
 
-TYPED_TEST_P( AtomicRefBasicExchangeUnitTest, BasicExchanges )
+TYPED_TEST_P(AtomicRefBasicExchangeUnitTest, BasicExchanges)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T swapper = (T)91;
-  T theval = (T)0;
-  T * memaddr = &theval;
+  T swapper  = (T)91;
+  T theval   = (T)0;
+  T* memaddr = &theval;
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test exchange method
-  swapper = test1.exchange( swapper );
-  ASSERT_EQ( test1, (T)91 );
-  ASSERT_EQ( swapper, (T)0 );
+  swapper = test1.exchange(swapper);
+  ASSERT_EQ(test1, (T)91);
+  ASSERT_EQ(swapper, (T)0);
 
   // test CAS method
-  swapper = test1.CAS( (T)91, swapper );
-  ASSERT_EQ( test1, (T)0 );
-  ASSERT_EQ( swapper, (T)91 );
+  swapper = test1.CAS((T)91, swapper);
+  ASSERT_EQ(test1, (T)0);
+  ASSERT_EQ(swapper, (T)91);
 
 
   bool result = true;
-  T testval = (T)19;
-  T & valref = testval;
+  T testval   = (T)19;
+  T& valref   = testval;
 
   // test strong exchange method
-  result = test1.compare_exchange_strong( valref, testval );
-  ASSERT_EQ( result, false );
-  ASSERT_EQ( test1, (T)0 );
-  ASSERT_EQ( swapper, (T)91 );
-  ASSERT_EQ( testval, (T)0 );
+  result = test1.compare_exchange_strong(valref, testval);
+  ASSERT_EQ(result, false);
+  ASSERT_EQ(test1, (T)0);
+  ASSERT_EQ(swapper, (T)91);
+  ASSERT_EQ(testval, (T)0);
 
   // test weak exchange method (same as strong exchange)
-  result = test1.compare_exchange_weak( valref, swapper );
-  ASSERT_EQ( result, true );
-  ASSERT_EQ( test1, (T)91 );
-  ASSERT_EQ( swapper, (T)91 );
-  ASSERT_EQ( testval, (T)0 );
+  result = test1.compare_exchange_weak(valref, swapper);
+  ASSERT_EQ(result, true);
+  ASSERT_EQ(test1, (T)91);
+  ASSERT_EQ(swapper, (T)91);
+  ASSERT_EQ(testval, (T)0);
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefBasicExchangeUnitTest,
-                             BasicExchanges
-                           );
-
-using basic_types = 
-    ::testing::Types<
-                      std::tuple<int, RAJA::builtin_atomic>,
-                      std::tuple<int, RAJA::seq_atomic>,
-                      std::tuple<unsigned int, RAJA::builtin_atomic>,
-                      std::tuple<unsigned int, RAJA::seq_atomic>,
-                      std::tuple<unsigned long long int, RAJA::builtin_atomic>,
-                      std::tuple<unsigned long long int, RAJA::seq_atomic>,
-                      std::tuple<float, RAJA::builtin_atomic>,
-                      std::tuple<float, RAJA::seq_atomic>,
-                      std::tuple<double, RAJA::builtin_atomic>,
-                      std::tuple<double, RAJA::seq_atomic>
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefBasicExchangeUnitTest, BasicExchanges);
+
+using basic_types =
+    ::testing::Types<std::tuple<int, RAJA::builtin_atomic>,
+                     std::tuple<int, RAJA::seq_atomic>,
+                     std::tuple<unsigned int, RAJA::builtin_atomic>,
+                     std::tuple<unsigned int, RAJA::seq_atomic>,
+                     std::tuple<unsigned long long int, RAJA::builtin_atomic>,
+                     std::tuple<unsigned long long int, RAJA::seq_atomic>,
+                     std::tuple<float, RAJA::builtin_atomic>,
+                     std::tuple<float, RAJA::seq_atomic>,
+                     std::tuple<double, RAJA::builtin_atomic>,
+                     std::tuple<double, RAJA::seq_atomic>
 #if defined(RAJA_ENABLE_OPENMP)
-                      ,
-                      std::tuple<int, RAJA::omp_atomic>,
-                      std::tuple<unsigned int, RAJA::omp_atomic>,
-                      std::tuple<unsigned long long int, RAJA::omp_atomic>,
-                      std::tuple<float, RAJA::omp_atomic>,
-                      std::tuple<double, RAJA::omp_atomic>
+                     ,
+                     std::tuple<int, RAJA::omp_atomic>,
+                     std::tuple<unsigned int, RAJA::omp_atomic>,
+                     std::tuple<unsigned long long int, RAJA::omp_atomic>,
+                     std::tuple<float, RAJA::omp_atomic>,
+                     std::tuple<double, RAJA::omp_atomic>
 #endif
 #if defined(RAJA_ENABLE_CUDA)
-                      ,
-                      std::tuple<int, RAJA::auto_atomic>,
-                      std::tuple<int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::cuda_atomic>,
-                      std::tuple<float, RAJA::auto_atomic>,
-                      std::tuple<float, RAJA::cuda_atomic>
+                     ,
+                     std::tuple<int, RAJA::auto_atomic>,
+                     std::tuple<int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::cuda_atomic>,
+                     std::tuple<float, RAJA::auto_atomic>,
+                     std::tuple<float, RAJA::cuda_atomic>
 #endif
-                    >;
+                     >;
 
-INSTANTIATE_TYPED_TEST_SUITE_P( BasicExchangeUnitTest,
-                                AtomicRefBasicExchangeUnitTest,
-                                basic_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(BasicExchangeUnitTest,
+                               AtomicRefBasicExchangeUnitTest,
+                               basic_types);
 
 
 // Pure CUDA test.
@@ -118,17 +114,17 @@ template <typename T>
 class AtomicRefCUDAExchangeUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefCUDAExchangeUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefCUDAExchangeUnitTest);
 
-GPU_TYPED_TEST_P( AtomicRefCUDAExchangeUnitTest, CUDAExchanges )
+GPU_TYPED_TEST_P(AtomicRefCUDAExchangeUnitTest, CUDAExchanges)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T * swapper = nullptr;
-  T * memaddr = nullptr;
-  T * testval = nullptr;
-  bool * result = nullptr;
+  T* swapper   = nullptr;
+  T* memaddr   = nullptr;
+  T* testval   = nullptr;
+  bool* result = nullptr;
   cudaErrchk(cudaMallocManaged(&swapper, sizeof(T)));
   cudaErrchk(cudaMallocManaged(&memaddr, sizeof(T)));
   cudaErrchk(cudaMallocManaged(&testval, sizeof(T)));
@@ -136,39 +132,45 @@ GPU_TYPED_TEST_P( AtomicRefCUDAExchangeUnitTest, CUDAExchanges )
   swapper[0] = (T)91;
   memaddr[0] = (T)0;
   testval[0] = (T)19;
-  result[0] = true;
+  result[0]  = true;
   cudaErrchk(cudaDeviceSynchronize());
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test exchange method
-  forone<test_cuda>( [=] __device__ () {swapper[0] = test1.exchange( swapper[0] );} );
+  forone<test_cuda>([=] __device__()
+                    { swapper[0] = test1.exchange(swapper[0]); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)91 );
-  ASSERT_EQ( swapper[0], (T)0 );
+  ASSERT_EQ(test1, (T)91);
+  ASSERT_EQ(swapper[0], (T)0);
 
   // test CAS method
-  forone<test_cuda>( [=] __device__ () {swapper[0] = test1.CAS( (T)91, swapper[0] );} );
+  forone<test_cuda>([=] __device__()
+                    { swapper[0] = test1.CAS((T)91, swapper[0]); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( test1, (T)0 );
-  ASSERT_EQ( swapper[0], (T)91 );
+  ASSERT_EQ(test1, (T)0);
+  ASSERT_EQ(swapper[0], (T)91);
 
   // test strong exchange method
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.compare_exchange_strong( testval[0], testval[0] );} );
+  forone<test_cuda>(
+      [=] __device__()
+      { result[0] = test1.compare_exchange_strong(testval[0], testval[0]); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], false );
-  ASSERT_EQ( test1, (T)0 );
-  ASSERT_EQ( swapper[0], (T)91 );
-  ASSERT_EQ( testval[0], (T)0 );
+  ASSERT_EQ(result[0], false);
+  ASSERT_EQ(test1, (T)0);
+  ASSERT_EQ(swapper[0], (T)91);
+  ASSERT_EQ(testval[0], (T)0);
 
   // test weak exchange method (same as strong exchange)
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.compare_exchange_weak( testval[0], swapper[0] );} );
+  forone<test_cuda>(
+      [=] __device__()
+      { result[0] = test1.compare_exchange_weak(testval[0], swapper[0]); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], true );
-  ASSERT_EQ( test1, (T)91 );
-  ASSERT_EQ( swapper[0], (T)91 );
-  ASSERT_EQ( testval[0], (T)0 );
+  ASSERT_EQ(result[0], true);
+  ASSERT_EQ(test1, (T)91);
+  ASSERT_EQ(swapper[0], (T)91);
+  ASSERT_EQ(testval[0], (T)0);
 
   cudaErrchk(cudaDeviceSynchronize());
   cudaErrchk(cudaFree(swapper));
@@ -177,25 +179,19 @@ GPU_TYPED_TEST_P( AtomicRefCUDAExchangeUnitTest, CUDAExchanges )
   cudaErrchk(cudaFree(result));
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefCUDAExchangeUnitTest,
-                             CUDAExchanges
-                           );
-
-using CUDA_types = 
-    ::testing::Types<
-                      std::tuple<int, RAJA::auto_atomic>,
-                      std::tuple<int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::cuda_atomic>,
-                      std::tuple<float, RAJA::auto_atomic>,
-                      std::tuple<float, RAJA::auto_atomic>
-                    >;
-
-INSTANTIATE_TYPED_TEST_SUITE_P( CUDAExchangeUnitTest,
-                                AtomicRefCUDAExchangeUnitTest,
-                                CUDA_types
-                              );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefCUDAExchangeUnitTest, CUDAExchanges);
+
+using CUDA_types =
+    ::testing::Types<std::tuple<int, RAJA::auto_atomic>,
+                     std::tuple<int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::cuda_atomic>,
+                     std::tuple<float, RAJA::auto_atomic>,
+                     std::tuple<float, RAJA::cuda_atomic>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(CUDAExchangeUnitTest,
+                               AtomicRefCUDAExchangeUnitTest,
+                               CUDA_types);
 #endif
-
diff --git a/test/unit/atomic/test-atomic-ref-minmax.cpp b/test/unit/atomic/test-atomic-ref-minmax.cpp
index a35ea15164..10d4825616 100644
--- a/test/unit/atomic/test-atomic-ref-minmax.cpp
+++ b/test/unit/atomic/test-atomic-ref-minmax.cpp
@@ -25,47 +25,44 @@ template <typename T>
 class AtomicRefBasicMinMaxUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefBasicMinMaxUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefBasicMinMaxUnitTest);
 
-TYPED_TEST_P( AtomicRefBasicMinMaxUnitTest, BasicMinMaxs )
+TYPED_TEST_P(AtomicRefBasicMinMaxUnitTest, BasicMinMaxs)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T theval = (T)91;
-  T * memaddr = &theval;
+  T theval   = (T)91;
+  T* memaddr = &theval;
   T result;
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test min
-  result = test1.fetch_min( (T)87 );
-  ASSERT_EQ( result, (T)91 );
-  ASSERT_EQ( test1, (T)87 );
+  result = test1.fetch_min((T)87);
+  ASSERT_EQ(result, (T)91);
+  ASSERT_EQ(test1, (T)87);
 
-  result = test1.min( (T)83 );
-  ASSERT_EQ( result, (T)83 );
-  ASSERT_EQ( test1, (T)83 );
+  result = test1.min((T)83);
+  ASSERT_EQ(result, (T)83);
+  ASSERT_EQ(test1, (T)83);
 
   // test max
-  result = test1.fetch_max( (T)87 );
-  ASSERT_EQ( result, (T)83 );
-  ASSERT_EQ( test1, (T)87 );
+  result = test1.fetch_max((T)87);
+  ASSERT_EQ(result, (T)83);
+  ASSERT_EQ(test1, (T)87);
 
-  result = test1.max( (T)91 );
-  ASSERT_EQ( result, (T)91 );
-  ASSERT_EQ( test1, (T)91 );
+  result = test1.max((T)91);
+  ASSERT_EQ(result, (T)91);
+  ASSERT_EQ(test1, (T)91);
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefBasicMinMaxUnitTest,
-                             BasicMinMaxs
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefBasicMinMaxUnitTest, BasicMinMaxs);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( BasicMinMaxUnitTest,
-                                AtomicRefBasicMinMaxUnitTest,
-                                basic_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(BasicMinMaxUnitTest,
+                               AtomicRefBasicMinMaxUnitTest,
+                               basic_types);
 
 // Pure CUDA test.
 #if defined(RAJA_ENABLE_CUDA)
@@ -75,57 +72,53 @@ template <typename T>
 class AtomicRefCUDAMinMaxUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( AtomicRefCUDAMinMaxUnitTest );
+TYPED_TEST_SUITE_P(AtomicRefCUDAMinMaxUnitTest);
 
-GPU_TYPED_TEST_P( AtomicRefCUDAMinMaxUnitTest, CUDAMinMaxs )
+GPU_TYPED_TEST_P(AtomicRefCUDAMinMaxUnitTest, CUDAMinMaxs)
 {
-  using T = typename std::tuple_element<0, TypeParam>::type;
+  using T            = typename std::tuple_element<0, TypeParam>::type;
   using AtomicPolicy = typename std::tuple_element<1, TypeParam>::type;
 
-  T * result = nullptr;
-  T * memaddr = nullptr;
+  T* result  = nullptr;
+  T* memaddr = nullptr;
   cudaErrchk(cudaMallocManaged(&result, sizeof(T)));
   cudaErrchk(cudaMallocManaged(&memaddr, sizeof(T)));
   memaddr[0] = (T)91;
   cudaErrchk(cudaDeviceSynchronize());
 
   // explicit constructor with memory address
-  RAJA::AtomicRef<T, AtomicPolicy> test1( memaddr );
+  RAJA::AtomicRef<T, AtomicPolicy> test1(memaddr);
 
   // test min
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.fetch_min( (T)87 );} );
+  forone<test_cuda>([=] __device__() { result[0] = test1.fetch_min((T)87); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)91 );
-  ASSERT_EQ( test1, (T)87 );
+  ASSERT_EQ(result[0], (T)91);
+  ASSERT_EQ(test1, (T)87);
 
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.min( (T)83 );} );
+  forone<test_cuda>([=] __device__() { result[0] = test1.min((T)83); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)83 );
-  ASSERT_EQ( test1, (T)83 );
+  ASSERT_EQ(result[0], (T)83);
+  ASSERT_EQ(test1, (T)83);
 
   // test max
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.fetch_max( (T)87 );} );
+  forone<test_cuda>([=] __device__() { result[0] = test1.fetch_max((T)87); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)83 );
-  ASSERT_EQ( test1, (T)87 );
+  ASSERT_EQ(result[0], (T)83);
+  ASSERT_EQ(test1, (T)87);
 
-  forone<test_cuda>( [=] __device__ () {result[0] = test1.max( (T)91 );} );
+  forone<test_cuda>([=] __device__() { result[0] = test1.max((T)91); });
   cudaErrchk(cudaDeviceSynchronize());
-  ASSERT_EQ( result[0], (T)91 );
-  ASSERT_EQ( test1, (T)91 );
+  ASSERT_EQ(result[0], (T)91);
+  ASSERT_EQ(test1, (T)91);
 
   cudaErrchk(cudaDeviceSynchronize());
   cudaErrchk(cudaFree(result));
   cudaErrchk(cudaFree(memaddr));
 }
 
-REGISTER_TYPED_TEST_SUITE_P( AtomicRefCUDAMinMaxUnitTest,
-                             CUDAMinMaxs
-                           );
+REGISTER_TYPED_TEST_SUITE_P(AtomicRefCUDAMinMaxUnitTest, CUDAMinMaxs);
 
-INSTANTIATE_TYPED_TEST_SUITE_P( CUDAMinMaxUnitTest,
-                                AtomicRefCUDAMinMaxUnitTest,
-                                CUDA_types
-                              );
+INSTANTIATE_TYPED_TEST_SUITE_P(CUDAMinMaxUnitTest,
+                               AtomicRefCUDAMinMaxUnitTest,
+                               CUDA_types);
 #endif
-
diff --git a/test/unit/atomic/test-atomic-ref.hpp b/test/unit/atomic/test-atomic-ref.hpp
index f5b7dd2943..6805c432cd 100644
--- a/test/unit/atomic/test-atomic-ref.hpp
+++ b/test/unit/atomic/test-atomic-ref.hpp
@@ -13,66 +13,62 @@
 #include "RAJA_gtest.hpp"
 
 using basic_types =
-    ::testing::Types<
-                      std::tuple<int, RAJA::builtin_atomic>,
-                      std::tuple<int, RAJA::seq_atomic>,
-                      std::tuple<unsigned int, RAJA::builtin_atomic>,
-                      std::tuple<unsigned int, RAJA::seq_atomic>,
-                      std::tuple<unsigned long long int, RAJA::builtin_atomic>,
-                      std::tuple<unsigned long long int, RAJA::seq_atomic>,
-                      std::tuple<float, RAJA::builtin_atomic>,
-                      std::tuple<float, RAJA::seq_atomic>,
-                      std::tuple<double, RAJA::builtin_atomic>,
-                      std::tuple<double, RAJA::seq_atomic>
+    ::testing::Types<std::tuple<int, RAJA::builtin_atomic>,
+                     std::tuple<int, RAJA::seq_atomic>,
+                     std::tuple<unsigned int, RAJA::builtin_atomic>,
+                     std::tuple<unsigned int, RAJA::seq_atomic>,
+                     std::tuple<unsigned long long int, RAJA::builtin_atomic>,
+                     std::tuple<unsigned long long int, RAJA::seq_atomic>,
+                     std::tuple<float, RAJA::builtin_atomic>,
+                     std::tuple<float, RAJA::seq_atomic>,
+                     std::tuple<double, RAJA::builtin_atomic>,
+                     std::tuple<double, RAJA::seq_atomic>
 #if defined(RAJA_ENABLE_OPENMP)
-                      ,
-                      std::tuple<int, RAJA::omp_atomic>,
-                      std::tuple<unsigned int, RAJA::omp_atomic>,
-                      std::tuple<unsigned long long int, RAJA::omp_atomic>,
-                      std::tuple<float, RAJA::omp_atomic>,
-                      std::tuple<double, RAJA::omp_atomic>
+                     ,
+                     std::tuple<int, RAJA::omp_atomic>,
+                     std::tuple<unsigned int, RAJA::omp_atomic>,
+                     std::tuple<unsigned long long int, RAJA::omp_atomic>,
+                     std::tuple<float, RAJA::omp_atomic>,
+                     std::tuple<double, RAJA::omp_atomic>
 #endif
 #if defined(RAJA_ENABLE_CUDA)
-                      ,
-                      std::tuple<int, RAJA::auto_atomic>,
-                      std::tuple<int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::cuda_atomic>,
-                      std::tuple<float, RAJA::auto_atomic>,
-                      std::tuple<float, RAJA::cuda_atomic>,
-                      std::tuple<double, RAJA::auto_atomic>,
-                      std::tuple<double, RAJA::cuda_atomic>
+                     ,
+                     std::tuple<int, RAJA::auto_atomic>,
+                     std::tuple<int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::cuda_atomic>,
+                     std::tuple<float, RAJA::auto_atomic>,
+                     std::tuple<float, RAJA::cuda_atomic>,
+                     std::tuple<double, RAJA::auto_atomic>,
+                     std::tuple<double, RAJA::cuda_atomic>
 #endif
 #if defined(RAJA_ENABLE_HIP)
-                      ,
-                      std::tuple<int, RAJA::auto_atomic>,
-                      std::tuple<int, RAJA::hip_atomic>,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::hip_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::hip_atomic>,
-                      std::tuple<float, RAJA::auto_atomic>,
-                      std::tuple<float, RAJA::hip_atomic>,
-                      std::tuple<double, RAJA::auto_atomic>,
-                      std::tuple<double, RAJA::hip_atomic>
+                     ,
+                     std::tuple<int, RAJA::auto_atomic>,
+                     std::tuple<int, RAJA::hip_atomic>,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::hip_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::hip_atomic>,
+                     std::tuple<float, RAJA::auto_atomic>,
+                     std::tuple<float, RAJA::hip_atomic>,
+                     std::tuple<double, RAJA::auto_atomic>,
+                     std::tuple<double, RAJA::hip_atomic>
 #endif
-                    >;
+                     >;
 
 #if defined(RAJA_ENABLE_CUDA)
 using CUDA_types =
-    ::testing::Types<
-                      std::tuple<int, RAJA::auto_atomic>,
-                      std::tuple<int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned int, RAJA::auto_atomic>,
-                      std::tuple<unsigned int, RAJA::cuda_atomic>,
-                      std::tuple<unsigned long long int, RAJA::auto_atomic>,
-                      std::tuple<unsigned long long int, RAJA::cuda_atomic>,
-                      std::tuple<float, RAJA::auto_atomic>,
-                      std::tuple<float, RAJA::auto_atomic>,
-                      std::tuple<double, RAJA::cuda_atomic>,
-                      std::tuple<double, RAJA::cuda_atomic>
-                    >;
+    ::testing::Types<std::tuple<int, RAJA::auto_atomic>,
+                     std::tuple<int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned int, RAJA::auto_atomic>,
+                     std::tuple<unsigned int, RAJA::cuda_atomic>,
+                     std::tuple<unsigned long long int, RAJA::auto_atomic>,
+                     std::tuple<unsigned long long int, RAJA::cuda_atomic>,
+                     std::tuple<float, RAJA::auto_atomic>,
+                     std::tuple<float, RAJA::cuda_atomic>,
+                     std::tuple<double, RAJA::auto_atomic>,
+                     std::tuple<double, RAJA::cuda_atomic>>;
 #endif
-
diff --git a/test/unit/hip/test-synchronize.cpp b/test/unit/hip/test-synchronize.cpp
index 1b0ce0a414..0ddc91bb9a 100644
--- a/test/unit/hip/test-synchronize.cpp
+++ b/test/unit/hip/test-synchronize.cpp
@@ -14,22 +14,21 @@
 GPU_TEST(SynchronizeUnitTest, HIP)
 {
 
-  double* managed_data = (double*) malloc(sizeof(double)*50);
+  double* managed_data = (double*)malloc(sizeof(double) * 50);
   double* d_managed_data;
-  hipMalloc(&d_managed_data, sizeof(double)*50);
+  hipMalloc(&d_managed_data, sizeof(double) * 50);
 
-  RAJA::forall<RAJA::hip_exec_async<256>>( RAJA::RangeSegment(0, 50),
-    [=] RAJA_HOST_DEVICE(RAJA::Index_type i) {
-    d_managed_data[i] = 1.0 * i;
-  });
+  RAJA::forall<RAJA::hip_exec_async<256>>(
+      RAJA::RangeSegment(0, 50), [=] RAJA_HOST_DEVICE(RAJA::Index_type i)
+      { d_managed_data[i] = 1.0 * i; });
   RAJA::synchronize<RAJA::hip_synchronize>();
 
-  hipMemcpy(managed_data, d_managed_data, sizeof(double)*50, hipMemcpyDeviceToHost);
+  hipMemcpy(managed_data, d_managed_data, sizeof(double) * 50,
+            hipMemcpyDeviceToHost);
 
-  RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(0, 50),
-    [=](RAJA::Index_type i) {
-    EXPECT_EQ(managed_data[i], 1.0 * i);
-  });
+  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, 50),
+                               [=](RAJA::Index_type i)
+                               { EXPECT_EQ(managed_data[i], 1.0 * i); });
 
   free(managed_data);
   hipFree(d_managed_data);
diff --git a/test/unit/index/test-indexset.cpp b/test/unit/index/test-indexset.cpp
index 8d0b282624..af9027a11c 100644
--- a/test/unit/index/test-indexset.cpp
+++ b/test/unit/index/test-indexset.cpp
@@ -17,7 +17,7 @@
 // Resource object used to construct list segment objects with indices
 // living in host (CPU) memory. Used in all tests.
 //
-  camp::resources::Resource host_res{camp::resources::Host()};
+camp::resources::Resource host_res {camp::resources::Host()};
 
 
 TEST(IndexSetUnitTest, Empty)
@@ -34,13 +34,13 @@ TEST(IndexSetUnitTest, Empty)
 
 TEST(IndexSetUnitTest, ConstructAndCompareSegments)
 {
-  using RangeSegType = RAJA::TypedRangeSegment<int>;
+  using RangeSegType  = RAJA::TypedRangeSegment<int>;
   using RIndexSetType = RAJA::TypedIndexSet<RangeSegType>;
   RIndexSetType isr;
   ASSERT_EQ((size_t)1, isr.getNumTypes());
   isr.push_back(RangeSegType(1, 3));
   isr.push_front(RangeSegType(0, 1));
-  ASSERT_EQ(2, isr.size()); 
+  ASSERT_EQ(2, isr.size());
   ASSERT_EQ(size_t(3), isr.getLength());
   const RangeSegType& rs0 = isr.getSegment<const RangeSegType>(0);
   const RangeSegType& rs1 = isr.getSegment<const RangeSegType>(1);
@@ -56,17 +56,17 @@ TEST(IndexSetUnitTest, ConstructAndCompareSegments)
   ASSERT_NE(isr.size(), isr2.size());
   ASSERT_EQ(isr.getLength(), isr2.getLength());
 
-  using ListSegType = RAJA::TypedListSegment<int>; 
+  using ListSegType    = RAJA::TypedListSegment<int>;
   using RLIndexSetType = RAJA::TypedIndexSet<RangeSegType, ListSegType>;
   RLIndexSetType isrl;
   ASSERT_EQ(size_t(2), isrl.getNumTypes());
-  int idx[ ] = {0, 2, 4, 5};
-  ListSegType lseg(idx, 4, host_res); 
+  int idx[] = {0, 2, 4, 5};
+  ListSegType lseg(idx, 4, host_res);
   isrl.push_back(lseg);
   isrl.push_back(RangeSegType(6, 8));
-  ASSERT_EQ(2, isrl.size()); 
+  ASSERT_EQ(2, isrl.size());
   ASSERT_EQ(size_t(6), isrl.getLength());
-  const ListSegType ls0 = isrl.getSegment<const ListSegType>(0);
+  const ListSegType ls0   = isrl.getSegment<const ListSegType>(0);
   const RangeSegType rs11 = isrl.getSegment<const RangeSegType>(1);
   ASSERT_EQ(4, ls0.size());
   ASSERT_EQ(2, rs11.size());
@@ -84,7 +84,7 @@ TEST(IndexSetUnitTest, ConstructAndCompareSegments)
 
 TEST(IndexSetUnitTest, Swap)
 {
-  using RangeSegType = RAJA::TypedRangeSegment<int>;
+  using RangeSegType  = RAJA::TypedRangeSegment<int>;
   using RIndexSetType = RAJA::TypedIndexSet<RangeSegType>;
   RIndexSetType iset1;
   RangeSegType range(0, 10);
@@ -109,7 +109,7 @@ TEST(IndexSetUnitTest, Swap)
 
 TEST(IndexSetUnitTest, Slice)
 {
-  using RangeSegType = RAJA::TypedRangeSegment<int>;
+  using RangeSegType  = RAJA::TypedRangeSegment<int>;
   using RIndexSetType = RAJA::TypedIndexSet<RangeSegType>;
   RIndexSetType iset1;
   RangeSegType range1(0, 2);
@@ -138,7 +138,7 @@ TEST(IndexSetUnitTest, Slice)
   ASSERT_EQ(8, *rs22.begin());
   ASSERT_EQ(10, *rs22.end());
 
-  int segs[ ] = {0, 3};
+  int segs[]          = {0, 3};
   RIndexSetType iset3 = iset1.createSlice(segs, 2);
   ASSERT_EQ(2, iset3.size());
   ASSERT_EQ(size_t(4), iset3.getLength());
@@ -165,19 +165,19 @@ TEST(IndexSetUnitTest, Slice)
 
 TEST(IndexSetUnitTest, ConditionalEvenIndices)
 {
-  using RangeSegType = RAJA::TypedRangeSegment<int>;
-  using ListSegType = RAJA::TypedListSegment<int>; 
+  using RangeSegType   = RAJA::TypedRangeSegment<int>;
+  using ListSegType    = RAJA::TypedListSegment<int>;
   using RLIndexSetType = RAJA::TypedIndexSet<RangeSegType, ListSegType>;
   RLIndexSetType iset;
 
   iset.push_back(RangeSegType(0, 6));
-  int idx[ ] = {7, 8, 10, 11};
-  ListSegType lseg(idx, 4, host_res); 
+  int idx[] = {7, 8, 10, 11};
+  ListSegType lseg(idx, 4, host_res);
   iset.push_back(lseg);
   iset.push_back(RangeSegType(13, 17));
 
   RAJA::RAJAVec<int> ref_even_indices;
-  ref_even_indices.push_back(0); 
+  ref_even_indices.push_back(0);
   ref_even_indices.push_back(2);
   ref_even_indices.push_back(4);
   ref_even_indices.push_back(8);
@@ -186,19 +186,18 @@ TEST(IndexSetUnitTest, ConditionalEvenIndices)
   ref_even_indices.push_back(16);
 
   RAJA::RAJAVec<int> even_indices;
-  getIndicesConditional(even_indices, iset, [] (int idx) {
-    return !(idx % 2);
-  });
+  getIndicesConditional(even_indices, iset, [](int idx) { return !(idx % 2); });
 
   EXPECT_EQ(even_indices.size(), ref_even_indices.size());
-  for (size_t i = 0; i < ref_even_indices.size(); ++i) {
+  for (size_t i = 0; i < ref_even_indices.size(); ++i)
+  {
     EXPECT_EQ(even_indices[i], ref_even_indices[i]);
   }
 }
 
 TEST(IndexSetUnitTest, ConditionalLessThan100Indices)
 {
-  using RangeSegType = RAJA::TypedRangeSegment<int>;
+  using RangeSegType  = RAJA::TypedRangeSegment<int>;
   using RIndexSetType = RAJA::TypedIndexSet<RangeSegType>;
   RIndexSetType iset;
 
@@ -215,12 +214,12 @@ TEST(IndexSetUnitTest, ConditionalLessThan100Indices)
   ref_lt100_indices.push_back(99);
 
   RAJA::RAJAVec<int> lt100_indices;
-  getIndicesConditional(lt100_indices, iset, [] (int idx) {
-    return (idx < 100);
-  });
+  getIndicesConditional(lt100_indices, iset,
+                        [](int idx) { return (idx < 100); });
 
   EXPECT_EQ(lt100_indices.size(), ref_lt100_indices.size());
-  for (size_t i = 0; i < ref_lt100_indices.size(); ++i) {
+  for (size_t i = 0; i < ref_lt100_indices.size(); ++i)
+  {
     EXPECT_EQ(lt100_indices[i], ref_lt100_indices[i]);
   }
 }
diff --git a/test/unit/index/test-indexvalue.cpp b/test/unit/index/test-indexvalue.cpp
index fad47715e9..ca148d2c91 100644
--- a/test/unit/index/test-indexvalue.cpp
+++ b/test/unit/index/test-indexvalue.cpp
@@ -13,8 +13,9 @@
 
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class IndexValueUnitTest : public ::testing::Test {};
+template <typename T>
+class IndexValueUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(IndexValueUnitTest, UnitIndexTypes);
 
@@ -181,7 +182,7 @@ TYPED_TEST(IndexValueUnitTest, IndexTypeArith)
   ASSERT_EQ(StrongTypeIndex(8), a);
   ASSERT_EQ(RAJA::Index_type(2), b);
 
-  
+
   RAJA_INDEX_VALUE_T(TestType, TypeParam, "Test Type");
   TestType c(8);
   RAJA::Index_type d(2);
diff --git a/test/unit/index/test-listsegment.cpp b/test/unit/index/test-listsegment.cpp
index 2ea0004b83..f810aac1ed 100644
--- a/test/unit/index/test-listsegment.cpp
+++ b/test/unit/index/test-listsegment.cpp
@@ -17,26 +17,28 @@
 
 #include <vector>
 
-template<typename T>
-class ListSegmentUnitTest : public ::testing::Test {};
+template <typename T>
+class ListSegmentUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(ListSegmentUnitTest, UnitIndexTypes);
 
 //
 // Resource object used to construct list segment objects with indices
-// living in host (CPU) memory. Used in all tests in this file. 
+// living in host (CPU) memory. Used in all tests in this file.
 //
-camp::resources::Resource host_res{camp::resources::Host()};
+camp::resources::Resource host_res {camp::resources::Host()};
 
 
 TYPED_TEST(ListSegmentUnitTest, Constructors)
 {
   std::vector<TypeParam> idx;
-  for (TypeParam i = 0; i < 5; ++i){
+  for (TypeParam i = 0; i < 5; ++i)
+  {
     idx.push_back(i);
   }
 
-  RAJA::TypedListSegment<TypeParam> list1( &idx[0], idx.size(), host_res);
+  RAJA::TypedListSegment<TypeParam> list1(&idx[0], idx.size(), host_res);
   ASSERT_EQ(list1.size(), idx.size());
   ASSERT_EQ(list1.getIndexOwnership(), RAJA::Owned);
 
@@ -50,20 +52,21 @@ TYPED_TEST(ListSegmentUnitTest, Constructors)
 
   RAJA::TypedListSegment<TypeParam> container(idx, host_res);
   ASSERT_EQ(container.getIndexOwnership(), RAJA::Owned);
-  ASSERT_EQ(moved, container); 
+  ASSERT_EQ(moved, container);
 }
 
 TYPED_TEST(ListSegmentUnitTest, Swaps)
 {
   std::vector<TypeParam> idx1;
   std::vector<TypeParam> idx2;
-  for (TypeParam i = 0; i < 5; ++i){
+  for (TypeParam i = 0; i < 5; ++i)
+  {
     idx1.push_back(i);
-    idx2.push_back(i+5);
+    idx2.push_back(i + 5);
   }
 
-  RAJA::TypedListSegment<TypeParam> list1( idx1, host_res );
-  RAJA::TypedListSegment<TypeParam> list2( idx2, host_res );
+  RAJA::TypedListSegment<TypeParam> list1(idx1, host_res);
+  RAJA::TypedListSegment<TypeParam> list2(idx2, host_res);
   auto list3 = RAJA::TypedListSegment<TypeParam>(list1);
   auto list4 = RAJA::TypedListSegment<TypeParam>(list2);
 
@@ -80,26 +83,25 @@ TYPED_TEST(ListSegmentUnitTest, Swaps)
 
 TYPED_TEST(ListSegmentUnitTest, Equality)
 {
-  std::vector<TypeParam> idx1{5,3,1,2};
-  RAJA::TypedListSegment<TypeParam> list( idx1, host_res );
+  std::vector<TypeParam> idx1 {5, 3, 1, 2};
+  RAJA::TypedListSegment<TypeParam> list(idx1, host_res);
 
-  std::vector<TypeParam> idx2{2,1,3,5};
-  
-  ASSERT_EQ(list.indicesEqual( &idx2.begin()[0], idx2.size() ), false);
+  std::vector<TypeParam> idx2 {2, 1, 3, 5};
 
-  std::reverse( idx2.begin(), idx2.end() );
+  ASSERT_EQ(list.indicesEqual(&idx2.begin()[0], idx2.size()), false);
 
-  ASSERT_EQ(list.indicesEqual( &idx2.begin()[0], idx2.size() ), true);
+  std::reverse(idx2.begin(), idx2.end());
+
+  ASSERT_EQ(list.indicesEqual(&idx2.begin()[0], idx2.size()), true);
 }
 
 TYPED_TEST(ListSegmentUnitTest, Iterators)
 {
-  std::vector<TypeParam> idx1{5,3,1,2};
-  RAJA::TypedListSegment<TypeParam> list( idx1, host_res );
+  std::vector<TypeParam> idx1 {5, 3, 1, 2};
+  RAJA::TypedListSegment<TypeParam> list(idx1, host_res);
 
   ASSERT_EQ(TypeParam(5), *list.begin());
-  ASSERT_EQ(TypeParam(2), *(list.end()-1));
+  ASSERT_EQ(TypeParam(2), *(list.end() - 1));
 
   ASSERT_EQ(4, list.size());
 }
-
diff --git a/test/unit/index/test-rangesegment.cpp b/test/unit/index/test-rangesegment.cpp
index be82671682..fbed2a15bd 100644
--- a/test/unit/index/test-rangesegment.cpp
+++ b/test/unit/index/test-rangesegment.cpp
@@ -13,18 +13,20 @@
 
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class RangeSegmentUnitTest : public ::testing::Test {};
+template <typename T>
+class RangeSegmentUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(RangeSegmentUnitTest, UnitIndexTypes);
 
 
-template< typename T, typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
+template <typename T,
+          typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
 void NegativeRangeSegConstructorsTest()
-{
-}
+{}
 
-template< typename T, typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
+template <typename T,
+          typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
 void NegativeRangeSegConstructorsTest()
 {
   RAJA::TypedRangeSegment<T> r1(-10, 7);
@@ -74,12 +76,13 @@ TYPED_TEST(RangeSegmentUnitTest, Swaps)
   ASSERT_EQ(r2, r3);
 }
 
-template< typename T, typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
+template <typename T,
+          typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
 void NegativeRangeSegIteratorsTest()
-{
-}
+{}
 
-template< typename T, typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
+template <typename T,
+          typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
 void NegativeRangeSegIteratorsTest()
 {
   RAJA::TypedRangeSegment<T> r3(-2, 100);
@@ -100,13 +103,14 @@ TYPED_TEST(RangeSegmentUnitTest, Iterators)
 }
 
 template <typename IDX_TYPE,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<IDX_TYPE>>::value>::type* = nullptr>
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<IDX_TYPE>>::value>::type* = nullptr>
 void runNegativeIndexSliceTests()
-{
-}
+{}
 
 template <typename IDX_TYPE,
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<IDX_TYPE>>::value>::type* = nullptr>
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<IDX_TYPE>>::value>::type* = nullptr>
 void runNegativeIndexSliceTests()
 {
   auto r1 = RAJA::TypedRangeSegment<IDX_TYPE>(-4, 4);
@@ -128,15 +132,15 @@ void runNegativeIndexSliceTests()
 TYPED_TEST(RangeSegmentUnitTest, Slices)
 {
   auto r1 = RAJA::TypedRangeSegment<TypeParam>(0, 125);
-  auto s1 = r1.slice(10,100);
+  auto s1 = r1.slice(10, 100);
 
   ASSERT_EQ(TypeParam(10), *s1.begin());
   ASSERT_EQ(TypeParam(110), *(s1.end()));
   ASSERT_EQ(TypeParam(100), s1.size());
 
- 
+
   auto r2 = RAJA::TypedRangeSegment<TypeParam>(0, 12);
-  auto s2 = r2.slice(1,13);
+  auto s2 = r2.slice(1, 13);
 
   ASSERT_EQ(TypeParam(1), *s2.begin());
   ASSERT_EQ(TypeParam(12), *(s2.end()));
@@ -144,7 +148,7 @@ TYPED_TEST(RangeSegmentUnitTest, Slices)
 
 
   auto r3 = RAJA::TypedRangeSegment<TypeParam>(1, 125);
-  auto s3 = r3.slice(10,100);
+  auto s3 = r3.slice(10, 100);
 
   ASSERT_EQ(TypeParam(11), *s3.begin());
   ASSERT_EQ(TypeParam(111), *(s3.end()));
@@ -160,7 +164,7 @@ TYPED_TEST(RangeSegmentUnitTest, Equality)
 
   ASSERT_EQ(r1, r2);
 
-  auto r3 = RAJA::TypedRangeSegment<TypeParam>(10,15);
+  auto r3 = RAJA::TypedRangeSegment<TypeParam>(10, 15);
 
   ASSERT_NE(r1, r3);
 }
diff --git a/test/unit/index/test-rangestridesegment.cpp b/test/unit/index/test-rangestridesegment.cpp
index 5b64e17b5c..ef640547d2 100644
--- a/test/unit/index/test-rangestridesegment.cpp
+++ b/test/unit/index/test-rangestridesegment.cpp
@@ -13,64 +13,67 @@
 
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class RangeStrideSegmentUnitTest : public ::testing::Test {};
+template <typename T>
+class RangeStrideSegmentUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(RangeStrideSegmentUnitTest, UnitIndexTypes);
 
 
 TYPED_TEST(RangeStrideSegmentUnitTest, Constructors)
 {
-    RAJA::TypedRangeStrideSegment<TypeParam> first(0, 10, 2);
-    RAJA::TypedRangeStrideSegment<TypeParam> copied(first);
-    ASSERT_EQ(first, copied);
-    RAJA::TypedRangeStrideSegment<TypeParam> moved(std::move(first));
-    ASSERT_EQ(moved, copied);
+  RAJA::TypedRangeStrideSegment<TypeParam> first(0, 10, 2);
+  RAJA::TypedRangeStrideSegment<TypeParam> copied(first);
+  ASSERT_EQ(first, copied);
+  RAJA::TypedRangeStrideSegment<TypeParam> moved(std::move(first));
+  ASSERT_EQ(moved, copied);
 }
 
 TYPED_TEST(RangeStrideSegmentUnitTest, Assignments)
 {
-    auto r = RAJA::make_strided_range<TypeParam>(static_cast<TypeParam>(0), 
-                                                 static_cast<TypeParam>(5), 
-                                                 static_cast<typename std::make_signed<TypeParam>::type>(3));
-    RAJA::TypedRangeStrideSegment<TypeParam> seg1 = r;
-    ASSERT_EQ(r, seg1);
-    RAJA::TypedRangeStrideSegment<TypeParam> seg2 = std::move(r);
-    ASSERT_EQ(seg2, seg1);
+  auto r = RAJA::make_strided_range<TypeParam>(
+      static_cast<TypeParam>(0), static_cast<TypeParam>(5),
+      static_cast<typename std::make_signed<TypeParam>::type>(3));
+  RAJA::TypedRangeStrideSegment<TypeParam> seg1 = r;
+  ASSERT_EQ(r, seg1);
+  RAJA::TypedRangeStrideSegment<TypeParam> seg2 = std::move(r);
+  ASSERT_EQ(seg2, seg1);
 }
 
 TYPED_TEST(RangeStrideSegmentUnitTest, Swaps)
 {
-    RAJA::TypedRangeStrideSegment<TypeParam> r1(0, 5, 2);
-    RAJA::TypedRangeStrideSegment<TypeParam> r2(1, 6, 1);
-    RAJA::TypedRangeStrideSegment<TypeParam> r3(r1);
-    RAJA::TypedRangeStrideSegment<TypeParam> r4(r2);
-    std::swap(r1, r2);
-    ASSERT_EQ(r1, r4);
-    ASSERT_EQ(r2, r3);
+  RAJA::TypedRangeStrideSegment<TypeParam> r1(0, 5, 2);
+  RAJA::TypedRangeStrideSegment<TypeParam> r2(1, 6, 1);
+  RAJA::TypedRangeStrideSegment<TypeParam> r3(r1);
+  RAJA::TypedRangeStrideSegment<TypeParam> r4(r2);
+  std::swap(r1, r2);
+  ASSERT_EQ(r1, r4);
+  ASSERT_EQ(r2, r3);
 }
 
 TYPED_TEST(RangeStrideSegmentUnitTest, Iterators)
 {
-    RAJA::TypedRangeStrideSegment<TypeParam> r1(0, 100, 4);
-    ASSERT_EQ(TypeParam(0), *r1.begin());
-    ASSERT_EQ(TypeParam(96), *(--r1.end()));
-    using difftype_t = decltype(std::distance(r1.begin(), r1.end()));
-    ASSERT_EQ(difftype_t(25), r1.end() - r1.begin());
-    ASSERT_EQ(difftype_t(25), std::distance(r1.begin(), r1.end()));
-    ASSERT_EQ(difftype_t(25), r1.size());
+  RAJA::TypedRangeStrideSegment<TypeParam> r1(0, 100, 4);
+  ASSERT_EQ(TypeParam(0), *r1.begin());
+  ASSERT_EQ(TypeParam(96), *(--r1.end()));
+  using difftype_t = decltype(std::distance(r1.begin(), r1.end()));
+  ASSERT_EQ(difftype_t(25), r1.end() - r1.begin());
+  ASSERT_EQ(difftype_t(25), std::distance(r1.begin(), r1.end()));
+  ASSERT_EQ(difftype_t(25), r1.size());
 }
 
-template< typename T, typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
+template <typename T,
+          typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
 void NegativeRangeStrideTestSizes()
-{
-}
+{}
 
-template< typename T, typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
+template <typename T,
+          typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
 void NegativeRangeStrideTestSizes()
 {
   RAJA::TypedRangeStrideSegment<T> segment16(-10, -2, 2);
-  using difftype_t = decltype(std::distance(segment16.begin(), segment16.end()));
+  using difftype_t =
+      decltype(std::distance(segment16.begin(), segment16.end()));
   ASSERT_EQ(segment16.size(), difftype_t(4));
 
   RAJA::TypedRangeStrideSegment<T> segment17(-5, 5, 2);
@@ -118,13 +121,16 @@ TYPED_TEST(RangeStrideSegmentUnitTest, Sizes)
   ASSERT_EQ(segment11.size(), difftype_t(2));
 
   // PRIMES
-  RAJA::TypedRangeStrideSegment<TypeParam> segment12(0, 7, 3);  // should produce 0,3,6
+  RAJA::TypedRangeStrideSegment<TypeParam> segment12(
+      0, 7, 3);  // should produce 0,3,6
   ASSERT_EQ(segment12.size(), difftype_t(3));
 
-  RAJA::TypedRangeStrideSegment<TypeParam> segment13(0, 13, 3);  // should produce 0,3,6,9,12
+  RAJA::TypedRangeStrideSegment<TypeParam> segment13(
+      0, 13, 3);  // should produce 0,3,6,9,12
   ASSERT_EQ(segment13.size(), difftype_t(5));
 
-  RAJA::TypedRangeStrideSegment<TypeParam> segment14(0, 17, 5);  // should produce 0,5,10,15
+  RAJA::TypedRangeStrideSegment<TypeParam> segment14(
+      0, 17, 5);  // should produce 0,5,10,15
   ASSERT_EQ(segment14.size(), difftype_t(4));
 
   // NEGATIVE STRIDE
@@ -136,13 +142,14 @@ TYPED_TEST(RangeStrideSegmentUnitTest, Sizes)
 }
 
 template <typename IDX_TYPE,
-  typename std::enable_if<std::is_unsigned<RAJA::strip_index_type_t<IDX_TYPE>>::value>::type* = nullptr>
+          typename std::enable_if<std::is_unsigned<
+              RAJA::strip_index_type_t<IDX_TYPE>>::value>::type* = nullptr>
 void runNegativeIndexStrideSliceTests()
-{
-}
+{}
 
-template <typename IDX_TYPE, 
-  typename std::enable_if<std::is_signed<RAJA::strip_index_type_t<IDX_TYPE>>::value>::type* = nullptr>
+template <typename IDX_TYPE,
+          typename std::enable_if<std::is_signed<
+              RAJA::strip_index_type_t<IDX_TYPE>>::value>::type* = nullptr>
 void runNegativeIndexStrideSliceTests()
 {
   auto r1 = RAJA::TypedRangeStrideSegment<IDX_TYPE>(10, -1, -1);
@@ -167,14 +174,14 @@ void runNegativeIndexStrideSliceTests()
   ASSERT_EQ(IDX_TYPE(-2), *s3.begin());
   ASSERT_EQ(IDX_TYPE(2), *s3.end());
   ASSERT_EQ(size_t(2), size_t(s3.size()));
- 
-  
+
+
   auto r4 = RAJA::TypedRangeStrideSegment<IDX_TYPE>(-9, -1, 1);
   auto s4 = r4.slice(3, 6);
 
   ASSERT_EQ(IDX_TYPE(-6), *s4.begin());
   ASSERT_EQ(IDX_TYPE(-1), *s4.end());
-  ASSERT_EQ(size_t(5), size_t(s4.size())); 
+  ASSERT_EQ(size_t(5), size_t(s4.size()));
 }
 
 TYPED_TEST(RangeStrideSegmentUnitTest, Slices)
@@ -222,5 +229,5 @@ TYPED_TEST(RangeStrideSegmentUnitTest, Equality)
 
   auto r3 = RAJA::TypedRangeStrideSegment<TypeParam>(1, 10, 1);
 
-  ASSERT_TRUE( !(r1 == r3));
+  ASSERT_TRUE(!(r1 == r3));
 }
diff --git a/test/unit/indexing/test-indexing.hpp b/test/unit/indexing/test-indexing.hpp
index 21038542ee..9978f17832 100644
--- a/test/unit/indexing/test-indexing.hpp
+++ b/test/unit/indexing/test-indexing.hpp
@@ -18,30 +18,26 @@
 // List of named_dims
 //
 using NamedDimensionTypeList =
-    camp::list<
-                camp::integral_constant<RAJA::named_dim, RAJA::named_dim::x>,
-                camp::integral_constant<RAJA::named_dim, RAJA::named_dim::y>,
-                camp::integral_constant<RAJA::named_dim, RAJA::named_dim::z>
-              >;
+    camp::list<camp::integral_constant<RAJA::named_dim, RAJA::named_dim::x>,
+               camp::integral_constant<RAJA::named_dim, RAJA::named_dim::y>,
+               camp::integral_constant<RAJA::named_dim, RAJA::named_dim::z>>;
 
 //
 // List of sizes
 //
 using SizeTypeList =
-    camp::list<
-                camp::integral_constant<int, RAJA::named_usage::ignored>,
-                camp::integral_constant<int, RAJA::named_usage::unspecified>,
-                camp::integral_constant<int, 1>,
-                camp::integral_constant<int, 7>
-              >;
+    camp::list<camp::integral_constant<int, RAJA::named_usage::ignored>,
+               camp::integral_constant<int, RAJA::named_usage::unspecified>,
+               camp::integral_constant<int, 1>,
+               camp::integral_constant<int, 7>>;
 
 //
 // Holder for indexing templates
 //
-template < template < RAJA::named_dim, int, int > class T >
+template <template <RAJA::named_dim, int, int> class T>
 struct indexing_holder
 {
-  template < RAJA::named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+  template <RAJA::named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
   using type = T<dim, BLOCK_SIZE, GRID_SIZE>;
 };
 
@@ -49,11 +45,13 @@ struct indexing_holder
 // List of indexing holder types
 //
 #if defined(RAJA_ENABLE_CUDA)
-using CudaIndexingHolderList = camp::list< indexing_holder<RAJA::cuda::IndexGlobal> >;
+using CudaIndexingHolderList =
+    camp::list<indexing_holder<RAJA::cuda::IndexGlobal>>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipIndexingHolderList = camp::list< indexing_holder<RAJA::hip::IndexGlobal> >;
+using HipIndexingHolderList =
+    camp::list<indexing_holder<RAJA::hip::IndexGlobal>>;
 #endif
 
 #endif  // __TEST_INDEXING_UTILS_HPP__
diff --git a/test/unit/indexing/tests/test-indexing-global.hpp b/test/unit/indexing/tests/test-indexing-global.hpp
index a345d80067..2fbb4a4421 100644
--- a/test/unit/indexing/tests/test-indexing-global.hpp
+++ b/test/unit/indexing/tests/test-indexing-global.hpp
@@ -18,83 +18,97 @@ template <typename T>
 class IndexingUnitTest : public ::testing::Test
 {};
 
-TYPED_TEST_SUITE_P( IndexingUnitTest );
+TYPED_TEST_SUITE_P(IndexingUnitTest);
 
-template < typename test_policy,
-           typename indexer_type,
-           RAJA::named_dim dim_012,
-           int BLOCK_SIZE,
-           int GRID_SIZE >
+template <typename test_policy,
+          typename indexer_type,
+          RAJA::named_dim dim_012,
+          int BLOCK_SIZE,
+          int GRID_SIZE>
 void testBasicIndexing()
 {
-  dim3d3d expected_dim{{1,1,1}, {1,1,1}};
-  if (BLOCK_SIZE != RAJA::named_usage::ignored) {
-    if (BLOCK_SIZE == RAJA::named_usage::unspecified) {
+  dim3d3d expected_dim {{1, 1, 1}, {1, 1, 1}};
+  if (BLOCK_SIZE != RAJA::named_usage::ignored)
+  {
+    if (BLOCK_SIZE == RAJA::named_usage::unspecified)
+    {
       expected_dim.thread[static_cast<int>(dim_012)] = 3;
-    } else {
+    }
+    else
+    {
       expected_dim.thread[static_cast<int>(dim_012)] = BLOCK_SIZE;
     }
   }
 
-  if (GRID_SIZE != RAJA::named_usage::ignored) {
-    if (GRID_SIZE == RAJA::named_usage::unspecified) {
+  if (GRID_SIZE != RAJA::named_usage::ignored)
+  {
+    if (GRID_SIZE == RAJA::named_usage::unspecified)
+    {
       expected_dim.block[static_cast<int>(dim_012)] = 5;
-    } else {
+    }
+    else
+    {
       expected_dim.block[static_cast<int>(dim_012)] = GRID_SIZE;
     }
   }
 
   const int total_global = expected_dim.product();
 
-  auto host_res = get_test_resource<test_seq>();
+  auto host_res    = get_test_resource<test_seq>();
   auto working_res = get_test_resource<test_policy>();
 
   int* actual_index = host_res.allocate<int>(total_global);
-  int* actual_size = host_res.allocate<int>(total_global);
+  int* actual_size  = host_res.allocate<int>(total_global);
 
-  for (int i = 0; i < total_global; ++i) {
+  for (int i = 0; i < total_global; ++i)
+  {
     actual_index[i] = -1;
-    actual_size[i] = -1;
+    actual_size[i]  = -1;
   }
 
-  actual_index = test_reallocate(working_res, host_res, actual_index, total_global);
-  actual_size = test_reallocate(working_res, host_res, actual_size, total_global);
+  actual_index =
+      test_reallocate(working_res, host_res, actual_index, total_global);
+  actual_size =
+      test_reallocate(working_res, host_res, actual_size, total_global);
 
   for3d3d<test_policy>(expected_dim,
-      [=] RAJA_HOST_DEVICE (dim3d3d idx, dim3d3d dim) {
-    int i = index(idx, dim);
-    actual_index[i] = indexer_type::template index<int>();
-    actual_size[i] = indexer_type::template size<int>();
-  });
-
-  actual_index = test_reallocate(host_res, working_res, actual_index, total_global);
-  actual_size = test_reallocate(host_res, working_res, actual_size, total_global);
-
-  for (int i = 0; i < total_global; ++i) {
-    ASSERT_EQ( actual_index[i], i );
-    ASSERT_EQ( actual_size[i], total_global );
+                       [=] RAJA_HOST_DEVICE(dim3d3d idx, dim3d3d dim)
+                       {
+                         int i           = index(idx, dim);
+                         actual_index[i] = indexer_type::template index<int>();
+                         actual_size[i]  = indexer_type::template size<int>();
+                       });
+
+  actual_index =
+      test_reallocate(host_res, working_res, actual_index, total_global);
+  actual_size =
+      test_reallocate(host_res, working_res, actual_size, total_global);
+
+  for (int i = 0; i < total_global; ++i)
+  {
+    ASSERT_EQ(actual_index[i], i);
+    ASSERT_EQ(actual_size[i], total_global);
   }
 
   host_res.deallocate(actual_index);
   host_res.deallocate(actual_size);
 }
 
-TYPED_TEST_P( IndexingUnitTest, BasicIndexing )
+TYPED_TEST_P(IndexingUnitTest, BasicIndexing)
 {
-  using test_policy = typename camp::at<TypeParam, camp::num<0>>::type;
+  using test_policy         = typename camp::at<TypeParam, camp::num<0>>::type;
   using indexer_holder_type = typename camp::at<TypeParam, camp::num<1>>::type;
-  using dim_type = typename camp::at<TypeParam, camp::num<2>>::type;
-  using threads_type = typename camp::at<TypeParam, camp::num<3>>::type;
-  using blocks_type = typename camp::at<TypeParam, camp::num<4>>::type;
+  using dim_type            = typename camp::at<TypeParam, camp::num<2>>::type;
+  using threads_type        = typename camp::at<TypeParam, camp::num<3>>::type;
+  using blocks_type         = typename camp::at<TypeParam, camp::num<4>>::type;
 
   using indexer_type = typename indexer_holder_type::template type<
       dim_type::value, threads_type::value, blocks_type::value>;
 
-  testBasicIndexing< test_policy, indexer_type,
-                     dim_type::value, threads_type::value, blocks_type::value >();
+  testBasicIndexing<test_policy, indexer_type, dim_type::value,
+                    threads_type::value, blocks_type::value>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P( IndexingUnitTest,
-                             BasicIndexing );
+REGISTER_TYPED_TEST_SUITE_P(IndexingUnitTest, BasicIndexing);
 
 #endif  //__TEST_INDEXING_GLOBAL__
diff --git a/test/unit/internal/test-iterators.cpp b/test/unit/internal/test-iterators.cpp
index b5eb0ade48..2d90dca4c0 100644
--- a/test/unit/internal/test-iterators.cpp
+++ b/test/unit/internal/test-iterators.cpp
@@ -14,11 +14,13 @@
 
 #include <limits>
 
-template<typename T>
-class NumericIteratorUnitTest : public ::testing::Test {};
+template <typename T>
+class NumericIteratorUnitTest : public ::testing::Test
+{};
 
-template<typename T>
-class StridedNumericIteratorUnitTest : public ::testing::Test {};
+template <typename T>
+class StridedNumericIteratorUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(NumericIteratorUnitTest, UnitExpandedIntegralTypes);
 TYPED_TEST_SUITE(StridedNumericIteratorUnitTest, UnitExpandedIntegralTypes);
@@ -84,7 +86,8 @@ TYPED_TEST(StridedNumericIteratorUnitTest, simple)
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
 TYPED_TEST(NumericIteratorUnitTest, overflow)
 {
-  if (std::is_unsigned<TypeParam>::value) {
+  if (std::is_unsigned<TypeParam>::value)
+  {
     ASSERT_ANY_THROW({
       TypeParam val = 10;
       RAJA::Iterators::numeric_iterator<TypeParam> of_it(val);
@@ -95,7 +98,7 @@ TYPED_TEST(NumericIteratorUnitTest, overflow)
       RAJA::Iterators::numeric_iterator<TypeParam> of_it(val);
       of_it += 11;
     });
-  
+
     ASSERT_ANY_THROW({
       TypeParam val = 10;
       RAJA::Iterators::numeric_iterator<TypeParam> of_it(val);
@@ -108,7 +111,7 @@ TYPED_TEST(NumericIteratorUnitTest, overflow)
       auto sum = of_it + 11;
       (void)sum;
     });
-  
+
     ASSERT_ANY_THROW({
       TypeParam val = 10;
       const RAJA::Iterators::numeric_iterator<TypeParam> of_it(val);
@@ -121,12 +124,13 @@ TYPED_TEST(NumericIteratorUnitTest, overflow)
       auto sum = 11 + of_it;
       (void)sum;
     });
-  } 
+  }
 }
 
 TYPED_TEST(StridedNumericIteratorUnitTest, overflow)
 {
-  if (std::is_unsigned<TypeParam>::value){
+  if (std::is_unsigned<TypeParam>::value)
+  {
     ASSERT_ANY_THROW({
       TypeParam val = 2;
       RAJA::Iterators::strided_numeric_iterator<TypeParam> of_it(val, 2);
diff --git a/test/unit/internal/test-rajavec.cpp b/test/unit/internal/test-rajavec.cpp
index edc093b4dd..131bb16c0b 100644
--- a/test/unit/internal/test-rajavec.cpp
+++ b/test/unit/internal/test-rajavec.cpp
@@ -27,8 +27,8 @@ TEST(RAJAVecUnitTest, basic_test)
 
   RAJA::RAJAVec<int> a1(a);
   ASSERT_EQ(a.size(), a1.size());
-  int* a_data = a.data(); 
-  int* a1_data = a1.data(); 
+  int* a_data  = a.data();
+  int* a1_data = a1.data();
   ASSERT_EQ(a_data[0], a1_data[0]);
   ASSERT_EQ(a_data[1], a1_data[1]);
 
diff --git a/test/unit/multi_reducer/test-multi-reducer.hpp b/test/unit/multi_reducer/test-multi-reducer.hpp
index a1f94e0895..965926b144 100644
--- a/test/unit/multi_reducer/test-multi-reducer.hpp
+++ b/test/unit/multi_reducer/test-multi-reducer.hpp
@@ -17,31 +17,31 @@
 //
 // Data types
 //
-using DataTypeList = camp::list< int,
-                                 float,
-                                 double >;
+using DataTypeList = camp::list<int, float, double>;
 
-using SequentialMultiReducerPolicyList = camp::list< RAJA::seq_multi_reduce >;
+using SequentialMultiReducerPolicyList = camp::list<RAJA::seq_multi_reduce>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPMultiReducerPolicyList = camp::list< RAJA::omp_multi_reduce,
-                                                 RAJA::omp_multi_reduce_ordered >;
+using OpenMPMultiReducerPolicyList =
+    camp::list<RAJA::omp_multi_reduce, RAJA::omp_multi_reduce_ordered>;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaMultiReducerPolicyList =
-  camp::list< RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init,
-              RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing,
-              RAJA::cuda_multi_reduce_atomic_global_host_init,
-              RAJA::cuda_multi_reduce_atomic_global_no_replication_host_init >;
+using CudaMultiReducerPolicyList = camp::list<
+    RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init,
+    RAJA::
+        cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing,
+    RAJA::cuda_multi_reduce_atomic_global_host_init,
+    RAJA::cuda_multi_reduce_atomic_global_no_replication_host_init>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipMultiReducerPolicyList =
-  camp::list< RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init,
-              RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing,
-              RAJA::hip_multi_reduce_atomic_global_host_init,
-              RAJA::hip_multi_reduce_atomic_global_no_replication_host_init  >;
+using HipMultiReducerPolicyList = camp::list<
+    RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init,
+    RAJA::
+        hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing,
+    RAJA::hip_multi_reduce_atomic_global_host_init,
+    RAJA::hip_multi_reduce_atomic_global_no_replication_host_init>;
 #endif
 
 #endif  // __TEST_MULTI_REDUCER_UTILS_HPP__
diff --git a/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp b/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp
index 1104ae1e28..e84f21a475 100644
--- a/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp
+++ b/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Header file containing tests for RAJA multi reducer constructors and initialization.
+/// Header file containing tests for RAJA multi reducer constructors and
+/// initialization.
 ///
 
 #ifndef __TEST_MULTI_REDUCER_CONSTRUCTOR__
@@ -22,63 +23,70 @@
 
 template <typename T>
 class MultiReducerBasicConstructorUnitTest : public ::testing::Test
-{
-};
+{};
 
 template <typename T>
 class MultiReducerSingleInitConstructorUnitTest : public ::testing::Test
-{
-};
+{};
 
 template <typename T>
 class MultiReducerContainerInitConstructorUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(MultiReducerBasicConstructorUnitTest);
 TYPED_TEST_SUITE_P(MultiReducerSingleInitConstructorUnitTest);
 TYPED_TEST_SUITE_P(MultiReducerContainerInitConstructorUnitTest);
 
 
-template <typename MultiReducePolicy,
-          typename NumericType>
+template <typename MultiReducePolicy, typename NumericType>
 void testBasicMultiReducerConstructorRegular(size_t num_bins)
 {
-  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(num_bins);
-  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(num_bins);
-  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(num_bins);
+  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(
+      num_bins);
+  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(
+      num_bins);
+  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(
+      num_bins);
 
   ASSERT_EQ(multi_reduce_sum.size(), num_bins);
   ASSERT_EQ(multi_reduce_min.size(), num_bins);
   ASSERT_EQ(multi_reduce_max.size(), num_bins);
 
-  for (size_t bin = 0; bin < num_bins; ++bin) {
+  for (size_t bin = 0; bin < num_bins; ++bin)
+  {
     ASSERT_EQ(multi_reduce_sum.get(bin), get_op_identity(multi_reduce_sum));
     ASSERT_EQ(multi_reduce_min.get(bin), get_op_identity(multi_reduce_min));
     ASSERT_EQ(multi_reduce_max.get(bin), get_op_identity(multi_reduce_max));
 
-    ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), get_op_identity(multi_reduce_sum));
-    ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), get_op_identity(multi_reduce_min));
-    ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), get_op_identity(multi_reduce_max));
+    ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(),
+              get_op_identity(multi_reduce_sum));
+    ASSERT_EQ((NumericType)multi_reduce_min[bin].get(),
+              get_op_identity(multi_reduce_min));
+    ASSERT_EQ((NumericType)multi_reduce_max[bin].get(),
+              get_op_identity(multi_reduce_max));
   }
 }
 
-template <typename MultiReducePolicy,
-          typename NumericType>
+template <typename MultiReducePolicy, typename NumericType>
 void testBasicMultiReducerConstructorBitwise(size_t num_bins)
 {
-  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(num_bins);
-  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(num_bins);
+  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(
+      num_bins);
+  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(
+      num_bins);
 
   ASSERT_EQ(multi_reduce_or.size(), num_bins);
   ASSERT_EQ(multi_reduce_and.size(), num_bins);
 
-  for (size_t bin = 0; bin < num_bins; ++bin) {
+  for (size_t bin = 0; bin < num_bins; ++bin)
+  {
     ASSERT_EQ(multi_reduce_or.get(bin), get_op_identity(multi_reduce_or));
     ASSERT_EQ(multi_reduce_and.get(bin), get_op_identity(multi_reduce_and));
 
-    ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), get_op_identity(multi_reduce_or));
-    ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), get_op_identity(multi_reduce_and));
+    ASSERT_EQ((NumericType)multi_reduce_or[bin].get(),
+              get_op_identity(multi_reduce_or));
+    ASSERT_EQ((NumericType)multi_reduce_and[bin].get(),
+              get_op_identity(multi_reduce_and));
   }
 }
 
@@ -87,8 +95,10 @@ template <typename MultiReducePolicy,
           std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr>
 void testBasicMultiReducerConstructor(size_t num_bins)
 {
-  testBasicMultiReducerConstructorRegular< MultiReducePolicy, NumericType >(num_bins);
-  testBasicMultiReducerConstructorBitwise< MultiReducePolicy, NumericType >(num_bins);
+  testBasicMultiReducerConstructorRegular<MultiReducePolicy, NumericType>(
+      num_bins);
+  testBasicMultiReducerConstructorBitwise<MultiReducePolicy, NumericType>(
+      num_bins);
 }
 ///
 template <typename MultiReducePolicy,
@@ -96,34 +106,39 @@ template <typename MultiReducePolicy,
           std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr>
 void testBasicMultiReducerConstructor(size_t num_bins)
 {
-  testBasicMultiReducerConstructorRegular< MultiReducePolicy, NumericType >(num_bins);
+  testBasicMultiReducerConstructorRegular<MultiReducePolicy, NumericType>(
+      num_bins);
 }
 
 TYPED_TEST_P(MultiReducerBasicConstructorUnitTest, MultiReducerConstructor)
 {
   using MultiReducePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using NumericType       = typename camp::at<TypeParam, camp::num<1>>::type;
 
-  testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(0);
-  testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(1);
-  testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(2);
-  testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(10);
+  testBasicMultiReducerConstructor<MultiReducePolicy, NumericType>(0);
+  testBasicMultiReducerConstructor<MultiReducePolicy, NumericType>(1);
+  testBasicMultiReducerConstructor<MultiReducePolicy, NumericType>(2);
+  testBasicMultiReducerConstructor<MultiReducePolicy, NumericType>(10);
 }
 
 
-template <typename MultiReducePolicy,
-          typename NumericType>
-void testMultiReducerSingleInitConstructorRegular(size_t num_bins, NumericType initVal)
+template <typename MultiReducePolicy, typename NumericType>
+void testMultiReducerSingleInitConstructorRegular(size_t num_bins,
+                                                  NumericType initVal)
 {
-  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(num_bins, initVal);
-  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(num_bins, initVal);
-  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(num_bins, initVal);
+  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(
+      num_bins, initVal);
+  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(
+      num_bins, initVal);
+  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(
+      num_bins, initVal);
 
   ASSERT_EQ(multi_reduce_sum.size(), num_bins);
   ASSERT_EQ(multi_reduce_min.size(), num_bins);
   ASSERT_EQ(multi_reduce_max.size(), num_bins);
 
-  for (size_t bin = 0; bin < num_bins; ++bin) {
+  for (size_t bin = 0; bin < num_bins; ++bin)
+  {
     ASSERT_EQ(multi_reduce_sum.get(bin), initVal);
     ASSERT_EQ(multi_reduce_min.get(bin), initVal);
     ASSERT_EQ(multi_reduce_max.get(bin), initVal);
@@ -134,17 +149,20 @@ void testMultiReducerSingleInitConstructorRegular(size_t num_bins, NumericType i
   }
 }
 
-template <typename MultiReducePolicy,
-          typename NumericType>
-void testMultiReducerSingleInitConstructorBitwise(size_t num_bins, NumericType initVal)
+template <typename MultiReducePolicy, typename NumericType>
+void testMultiReducerSingleInitConstructorBitwise(size_t num_bins,
+                                                  NumericType initVal)
 {
-  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(num_bins, initVal);
-  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(num_bins, initVal);
+  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(
+      num_bins, initVal);
+  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(
+      num_bins, initVal);
 
   ASSERT_EQ(multi_reduce_or.size(), num_bins);
   ASSERT_EQ(multi_reduce_and.size(), num_bins);
 
-  for (size_t bin = 0; bin < num_bins; ++bin) {
+  for (size_t bin = 0; bin < num_bins; ++bin)
+  {
     ASSERT_EQ(multi_reduce_or.get(bin), initVal);
     ASSERT_EQ(multi_reduce_and.get(bin), initVal);
 
@@ -155,48 +173,57 @@ void testMultiReducerSingleInitConstructorBitwise(size_t num_bins, NumericType i
 
 template <typename MultiReducePolicy,
           typename NumericType,
-          std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr >
+          std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr>
 void testMultiReducerSingleInitConstructor(size_t num_bins, NumericType initVal)
 {
-  testMultiReducerSingleInitConstructorRegular< MultiReducePolicy, NumericType >(num_bins, initVal);
-  testMultiReducerSingleInitConstructorBitwise< MultiReducePolicy, NumericType >(num_bins, initVal);
+  testMultiReducerSingleInitConstructorRegular<MultiReducePolicy, NumericType>(
+      num_bins, initVal);
+  testMultiReducerSingleInitConstructorBitwise<MultiReducePolicy, NumericType>(
+      num_bins, initVal);
 }
 ///
 template <typename MultiReducePolicy,
           typename NumericType,
-          std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr >
+          std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr>
 void testMultiReducerSingleInitConstructor(size_t num_bins, NumericType initVal)
 {
-  testMultiReducerSingleInitConstructorRegular< MultiReducePolicy, NumericType >(num_bins, initVal);
+  testMultiReducerSingleInitConstructorRegular<MultiReducePolicy, NumericType>(
+      num_bins, initVal);
 }
 
 TYPED_TEST_P(MultiReducerSingleInitConstructorUnitTest, MultiReducerConstructor)
 {
   using MultiReducePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
-
-  testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(0, NumericType(2));
-  testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(1, NumericType(4));
-  testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(2, NumericType(0));
-  testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(10, NumericType(9));
+  using NumericType       = typename camp::at<TypeParam, camp::num<1>>::type;
+
+  testMultiReducerSingleInitConstructor<MultiReducePolicy, NumericType>(
+      0, NumericType(2));
+  testMultiReducerSingleInitConstructor<MultiReducePolicy, NumericType>(
+      1, NumericType(4));
+  testMultiReducerSingleInitConstructor<MultiReducePolicy, NumericType>(
+      2, NumericType(0));
+  testMultiReducerSingleInitConstructor<MultiReducePolicy, NumericType>(
+      10, NumericType(9));
 }
 
 
-template <typename MultiReducePolicy,
-          typename NumericType,
-          typename Container>
+template <typename MultiReducePolicy, typename NumericType, typename Container>
 void testMultiReducerContainerInitConstructorRegular(Container const& container)
 {
-  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(container);
-  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(container);
-  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(container);
+  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(
+      container);
+  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(
+      container);
+  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(
+      container);
 
   ASSERT_EQ(multi_reduce_sum.size(), container.size());
   ASSERT_EQ(multi_reduce_min.size(), container.size());
   ASSERT_EQ(multi_reduce_max.size(), container.size());
 
   size_t bin = 0;
-  for (NumericType val : container) {
+  for (NumericType val : container)
+  {
     ASSERT_EQ(multi_reduce_sum.get(bin), val);
     ASSERT_EQ(multi_reduce_min.get(bin), val);
     ASSERT_EQ(multi_reduce_max.get(bin), val);
@@ -208,19 +235,20 @@ void testMultiReducerContainerInitConstructorRegular(Container const& container)
   }
 }
 
-template <typename MultiReducePolicy,
-          typename NumericType,
-          typename Container>
+template <typename MultiReducePolicy, typename NumericType, typename Container>
 void testMultiReducerContainerInitConstructorBitwise(Container const& container)
 {
-  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(container);
-  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(container);
+  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(
+      container);
+  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(
+      container);
 
   ASSERT_EQ(multi_reduce_and.size(), container.size());
   ASSERT_EQ(multi_reduce_or.size(), container.size());
 
   size_t bin = 0;
-  for (NumericType val : container) {
+  for (NumericType val : container)
+  {
     ASSERT_EQ(multi_reduce_and.get(bin), val);
     ASSERT_EQ(multi_reduce_or.get(bin), val);
 
@@ -236,8 +264,10 @@ template <typename MultiReducePolicy,
           std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr>
 void testMultiReducerContainerInitConstructor(Container const& container)
 {
-  testMultiReducerContainerInitConstructorRegular< MultiReducePolicy, NumericType >(container);
-  testMultiReducerContainerInitConstructorBitwise< MultiReducePolicy, NumericType >(container);
+  testMultiReducerContainerInitConstructorRegular<MultiReducePolicy,
+                                                  NumericType>(container);
+  testMultiReducerContainerInitConstructorBitwise<MultiReducePolicy,
+                                                  NumericType>(container);
 }
 ///
 template <typename MultiReducePolicy,
@@ -246,13 +276,15 @@ template <typename MultiReducePolicy,
           std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr>
 void testMultiReducerContainerInitConstructor(Container const& container)
 {
-  testMultiReducerContainerInitConstructorRegular< MultiReducePolicy, NumericType >(container);
+  testMultiReducerContainerInitConstructorRegular<MultiReducePolicy,
+                                                  NumericType>(container);
 }
 
-TYPED_TEST_P(MultiReducerContainerInitConstructorUnitTest, MultiReducerConstructor)
+TYPED_TEST_P(MultiReducerContainerInitConstructorUnitTest,
+             MultiReducerConstructor)
 {
   using MultiReducePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using NumericType       = typename camp::at<TypeParam, camp::num<1>>::type;
 
   std::vector<NumericType> c0(0);
   std::vector<NumericType> c1(1, 3);
@@ -260,13 +292,14 @@ TYPED_TEST_P(MultiReducerContainerInitConstructorUnitTest, MultiReducerConstruct
   c2.emplace(5);
   c2.emplace(8);
   std::list<NumericType> c10;
-  for (size_t bin = 0; bin < size_t(10); ++bin) {
+  for (size_t bin = 0; bin < size_t(10); ++bin)
+  {
     c10.emplace_front(NumericType(bin));
   }
-  testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c0);
-  testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c1);
-  testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c2);
-  testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c10);
+  testMultiReducerContainerInitConstructor<MultiReducePolicy, NumericType>(c0);
+  testMultiReducerContainerInitConstructor<MultiReducePolicy, NumericType>(c1);
+  testMultiReducerContainerInitConstructor<MultiReducePolicy, NumericType>(c2);
+  testMultiReducerContainerInitConstructor<MultiReducePolicy, NumericType>(c10);
 }
 
 
diff --git a/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp b/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp
index 0eb1eb6eb6..379cdbd6fc 100644
--- a/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp
+++ b/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp
@@ -22,44 +22,45 @@
 
 template <typename T>
 class MultiReducerBasicResetUnitTest : public ::testing::Test
-{
-};
+{};
 
 template <typename T>
 class MultiReducerSingleResetUnitTest : public ::testing::Test
-{
-};
+{};
 
 template <typename T>
 class MultiReducerContainerResetUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(MultiReducerBasicResetUnitTest);
 TYPED_TEST_SUITE_P(MultiReducerSingleResetUnitTest);
 TYPED_TEST_SUITE_P(MultiReducerContainerResetUnitTest);
 
 
-
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol  >
+template <typename MultiReducePolicy, typename NumericType, typename ForOnePol>
 void testMultiReducerBasicResetRegular(bool use_reducer, size_t num_bins)
 {
   NumericType initVal = NumericType(5);
 
-  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(num_bins, initVal);
-  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(num_bins, initVal);
-  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(num_bins, initVal);
-
-  if (use_reducer) {
-    forone<ForOnePol>( [=] RAJA_HOST_DEVICE() {
-      for (size_t bin = 0; bin < num_bins; ++bin) {
-        multi_reduce_sum[bin] += initVal;
-        multi_reduce_min[bin].min(initVal-1);
-        multi_reduce_max[bin].max(initVal+1);
-      }
-    });
+  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(
+      num_bins, initVal);
+  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(
+      num_bins, initVal);
+  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(
+      num_bins, initVal);
+
+  if (use_reducer)
+  {
+    forone<ForOnePol>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          for (size_t bin = 0; bin < num_bins; ++bin)
+          {
+            multi_reduce_sum[bin] += initVal;
+            multi_reduce_min[bin].min(initVal - 1);
+            multi_reduce_max[bin].max(initVal + 1);
+          }
+        });
   }
 
   multi_reduce_sum.reset();
@@ -70,34 +71,42 @@ void testMultiReducerBasicResetRegular(bool use_reducer, size_t num_bins)
   ASSERT_EQ(multi_reduce_min.size(), num_bins);
   ASSERT_EQ(multi_reduce_max.size(), num_bins);
 
-  for (size_t bin = 0; bin < num_bins; ++bin) {
+  for (size_t bin = 0; bin < num_bins; ++bin)
+  {
     ASSERT_EQ(multi_reduce_sum.get(bin), get_op_identity(multi_reduce_sum));
     ASSERT_EQ(multi_reduce_min.get(bin), get_op_identity(multi_reduce_min));
     ASSERT_EQ(multi_reduce_max.get(bin), get_op_identity(multi_reduce_max));
 
-    ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), get_op_identity(multi_reduce_sum));
-    ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), get_op_identity(multi_reduce_min));
-    ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), get_op_identity(multi_reduce_max));
+    ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(),
+              get_op_identity(multi_reduce_sum));
+    ASSERT_EQ((NumericType)multi_reduce_min[bin].get(),
+              get_op_identity(multi_reduce_min));
+    ASSERT_EQ((NumericType)multi_reduce_max[bin].get(),
+              get_op_identity(multi_reduce_max));
   }
 }
 
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol  >
+template <typename MultiReducePolicy, typename NumericType, typename ForOnePol>
 void testMultiReducerBasicResetBitwise(bool use_reducer, size_t num_bins)
 {
   NumericType initVal = NumericType(5);
 
-  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(num_bins, initVal);
-  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(num_bins, initVal);
-
-  if (use_reducer) {
-    forone<ForOnePol>( [=] RAJA_HOST_DEVICE() {
-      for (size_t bin = 0; bin < num_bins; ++bin) {
-        multi_reduce_and[bin] &= initVal-1;
-        multi_reduce_or[bin] |= initVal+1;
-      }
-    });
+  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(
+      num_bins, initVal);
+  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(
+      num_bins, initVal);
+
+  if (use_reducer)
+  {
+    forone<ForOnePol>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          for (size_t bin = 0; bin < num_bins; ++bin)
+          {
+            multi_reduce_and[bin] &= initVal - 1;
+            multi_reduce_or[bin] |= initVal + 1;
+          }
+        });
   }
 
   multi_reduce_and.reset();
@@ -106,72 +115,87 @@ void testMultiReducerBasicResetBitwise(bool use_reducer, size_t num_bins)
   ASSERT_EQ(multi_reduce_and.size(), num_bins);
   ASSERT_EQ(multi_reduce_or.size(), num_bins);
 
-  for (size_t bin = 0; bin < num_bins; ++bin) {
+  for (size_t bin = 0; bin < num_bins; ++bin)
+  {
     ASSERT_EQ(multi_reduce_and.get(bin), get_op_identity(multi_reduce_and));
     ASSERT_EQ(multi_reduce_or.get(bin), get_op_identity(multi_reduce_or));
 
-    ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), get_op_identity(multi_reduce_and));
-    ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), get_op_identity(multi_reduce_or));
+    ASSERT_EQ((NumericType)multi_reduce_and[bin].get(),
+              get_op_identity(multi_reduce_and));
+    ASSERT_EQ((NumericType)multi_reduce_or[bin].get(),
+              get_op_identity(multi_reduce_or));
   }
 }
 
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr >
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr>
 void testMultiReducerBasicReset(size_t num_bins)
 {
-  testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins);
-  testMultiReducerBasicResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins);
+  testMultiReducerBasicResetRegular<MultiReducePolicy, NumericType, ForOnePol>(
+      false, num_bins);
+  testMultiReducerBasicResetBitwise<MultiReducePolicy, NumericType, ForOnePol>(
+      false, num_bins);
   // avoid using the reducer as forone does not handle reducers correctly
   // forone does not make_lambda_body or privatize the body
-  // testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins);
-  // testMultiReducerBasicResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins);
+  // testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType,
+  // ForOnePol >(true, num_bins);
+  // testMultiReducerBasicResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins);
 }
 ///
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr >
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr>
 void testMultiReducerBasicReset(size_t num_bins)
 {
-  testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins);
+  testMultiReducerBasicResetRegular<MultiReducePolicy, NumericType, ForOnePol>(
+      false, num_bins);
   // avoid using the reducer as forone does not handle reducers correctly
   // forone does not make_lambda_body or privatize the body
-  // testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins);
+  // testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType,
+  // ForOnePol >(true, num_bins);
 }
 
 TYPED_TEST_P(MultiReducerBasicResetUnitTest, MultiReducerReset)
 {
   using MultiReducePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
-  using ForOnePol = typename camp::at<TypeParam, camp::num<2>>::type;
+  using NumericType       = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ForOnePol         = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(0);
-  testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(1);
-  testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(2);
-  testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(10);
+  testMultiReducerBasicReset<MultiReducePolicy, NumericType, ForOnePol>(0);
+  testMultiReducerBasicReset<MultiReducePolicy, NumericType, ForOnePol>(1);
+  testMultiReducerBasicReset<MultiReducePolicy, NumericType, ForOnePol>(2);
+  testMultiReducerBasicReset<MultiReducePolicy, NumericType, ForOnePol>(10);
 }
 
 
-
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol  >
-void testMultiReducerSingleResetRegular(bool use_reducer, size_t init_bins, size_t num_bins, NumericType initVal)
+template <typename MultiReducePolicy, typename NumericType, typename ForOnePol>
+void testMultiReducerSingleResetRegular(bool use_reducer,
+                                        size_t init_bins,
+                                        size_t num_bins,
+                                        NumericType initVal)
 {
-  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(init_bins, initVal);
-  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(init_bins, initVal);
-  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(init_bins, initVal);
-
-  if (use_reducer) {
-    forone<ForOnePol>( [=] RAJA_HOST_DEVICE() {
-      for (size_t bin = 0; bin < init_bins; ++bin) {
-        multi_reduce_sum[bin] += initVal;
-        multi_reduce_min[bin].min(initVal-1);
-        multi_reduce_max[bin].max(initVal+1);
-      }
-    });
+  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(
+      init_bins, initVal);
+  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(
+      init_bins, initVal);
+  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(
+      init_bins, initVal);
+
+  if (use_reducer)
+  {
+    forone<ForOnePol>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          for (size_t bin = 0; bin < init_bins; ++bin)
+          {
+            multi_reduce_sum[bin] += initVal;
+            multi_reduce_min[bin].min(initVal - 1);
+            multi_reduce_max[bin].max(initVal + 1);
+          }
+        });
   }
 
   multi_reduce_sum.reset(num_bins, initVal);
@@ -182,7 +206,8 @@ void testMultiReducerSingleResetRegular(bool use_reducer, size_t init_bins, size
   ASSERT_EQ(multi_reduce_min.size(), num_bins);
   ASSERT_EQ(multi_reduce_max.size(), num_bins);
 
-  for (size_t bin = 0; bin < num_bins; ++bin) {
+  for (size_t bin = 0; bin < num_bins; ++bin)
+  {
     ASSERT_EQ(multi_reduce_sum.get(bin), initVal);
     ASSERT_EQ(multi_reduce_min.get(bin), initVal);
     ASSERT_EQ(multi_reduce_max.get(bin), initVal);
@@ -193,21 +218,28 @@ void testMultiReducerSingleResetRegular(bool use_reducer, size_t init_bins, size
   }
 }
 
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol  >
-void testMultiReducerSingleResetBitwise(bool use_reducer, size_t init_bins, size_t num_bins, NumericType initVal)
+template <typename MultiReducePolicy, typename NumericType, typename ForOnePol>
+void testMultiReducerSingleResetBitwise(bool use_reducer,
+                                        size_t init_bins,
+                                        size_t num_bins,
+                                        NumericType initVal)
 {
-  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(init_bins, initVal);
-  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(init_bins, initVal);
-
-  if (use_reducer) {
-    forone<ForOnePol>( [=] RAJA_HOST_DEVICE() {
-      for (size_t bin = 0; bin < init_bins; ++bin) {
-        multi_reduce_and[bin] &= initVal-1;
-        multi_reduce_or[bin] |= initVal+1;
-      }
-    });
+  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(
+      init_bins, initVal);
+  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(
+      init_bins, initVal);
+
+  if (use_reducer)
+  {
+    forone<ForOnePol>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          for (size_t bin = 0; bin < init_bins; ++bin)
+          {
+            multi_reduce_and[bin] &= initVal - 1;
+            multi_reduce_or[bin] |= initVal + 1;
+          }
+        });
   }
 
   multi_reduce_and.reset(num_bins, initVal);
@@ -216,7 +248,8 @@ void testMultiReducerSingleResetBitwise(bool use_reducer, size_t init_bins, size
   ASSERT_EQ(multi_reduce_and.size(), num_bins);
   ASSERT_EQ(multi_reduce_or.size(), num_bins);
 
-  for (size_t bin = 0; bin < num_bins; ++bin) {
+  for (size_t bin = 0; bin < num_bins; ++bin)
+  {
     ASSERT_EQ(multi_reduce_and.get(bin), initVal);
     ASSERT_EQ(multi_reduce_or.get(bin), initVal);
 
@@ -225,77 +258,100 @@ void testMultiReducerSingleResetBitwise(bool use_reducer, size_t init_bins, size
   }
 }
 
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr >
-void testMultiReducerSingleResetSize(size_t init_bins, size_t num_bins, NumericType initVal)
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr>
+void testMultiReducerSingleResetSize(size_t init_bins,
+                                     size_t num_bins,
+                                     NumericType initVal)
 {
-  testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal);
-  testMultiReducerSingleResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal);
+  testMultiReducerSingleResetRegular<MultiReducePolicy, NumericType, ForOnePol>(
+      false, init_bins, num_bins, initVal);
+  testMultiReducerSingleResetBitwise<MultiReducePolicy, NumericType, ForOnePol>(
+      false, init_bins, num_bins, initVal);
   // avoid using the reducer as forone does not handle reducers correctly
   // forone does not make_lambda_body or privatize the body
-  // testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal);
-  // testMultiReducerSingleResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal);
+  // testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType,
+  // ForOnePol >(true, init_bins, num_bins, initVal);
+  // testMultiReducerSingleResetBitwise< MultiReducePolicy, NumericType,
+  // ForOnePol >(true, init_bins, num_bins, initVal);
 }
 ///
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr >
-void testMultiReducerSingleResetSize(size_t init_bins, size_t num_bins, NumericType initVal)
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr>
+void testMultiReducerSingleResetSize(size_t init_bins,
+                                     size_t num_bins,
+                                     NumericType initVal)
 {
-  testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal);
+  testMultiReducerSingleResetRegular<MultiReducePolicy, NumericType, ForOnePol>(
+      false, init_bins, num_bins, initVal);
   // avoid using the reducer as forone does not handle reducers correctly
   // forone does not make_lambda_body or privatize the body
-  // testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal);
+  // testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType,
+  // ForOnePol >(true, init_bins, num_bins, initVal);
 }
 
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol >
+template <typename MultiReducePolicy, typename NumericType, typename ForOnePol>
 void testMultiReducerSingleReset(size_t num_bins, NumericType initVal)
 {
-  testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(0, num_bins, initVal);
-  testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(4, num_bins, initVal);
-  testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(num_bins, num_bins, initVal);
+  testMultiReducerSingleResetSize<MultiReducePolicy, NumericType, ForOnePol>(
+      0, num_bins, initVal);
+  testMultiReducerSingleResetSize<MultiReducePolicy, NumericType, ForOnePol>(
+      4, num_bins, initVal);
+  testMultiReducerSingleResetSize<MultiReducePolicy, NumericType, ForOnePol>(
+      num_bins, num_bins, initVal);
 }
 
 TYPED_TEST_P(MultiReducerSingleResetUnitTest, MultiReducerReset)
 {
   using MultiReducePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
-  using ForOnePol = typename camp::at<TypeParam, camp::num<2>>::type;
-
-  testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(0, NumericType(3));
-  testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(1, NumericType(5));
-  testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(2, NumericType(0));
-  testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(10, NumericType(8));
+  using NumericType       = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ForOnePol         = typename camp::at<TypeParam, camp::num<2>>::type;
+
+  testMultiReducerSingleReset<MultiReducePolicy, NumericType, ForOnePol>(
+      0, NumericType(3));
+  testMultiReducerSingleReset<MultiReducePolicy, NumericType, ForOnePol>(
+      1, NumericType(5));
+  testMultiReducerSingleReset<MultiReducePolicy, NumericType, ForOnePol>(
+      2, NumericType(0));
+  testMultiReducerSingleReset<MultiReducePolicy, NumericType, ForOnePol>(
+      10, NumericType(8));
 }
 
 
-
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            typename Container  >
-void testMultiReducerContainerResetRegular(bool use_reducer, size_t init_bins, Container const& container)
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          typename Container>
+void testMultiReducerContainerResetRegular(bool use_reducer,
+                                           size_t init_bins,
+                                           Container const& container)
 {
   const size_t num_bins = container.size();
-  NumericType initVal = NumericType(5);
-
-  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(init_bins, initVal);
-  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(init_bins, initVal);
-  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(init_bins, initVal);
-
-  if (use_reducer) {
-    forone<ForOnePol>( [=] RAJA_HOST_DEVICE() {
-      for (size_t bin = 0; bin < init_bins; ++bin) {
-        multi_reduce_sum[bin] += initVal;
-        multi_reduce_min[bin].min(initVal-1);
-        multi_reduce_max[bin].max(initVal+1);
-      }
-    });
+  NumericType initVal   = NumericType(5);
+
+  RAJA::MultiReduceSum<MultiReducePolicy, NumericType> multi_reduce_sum(
+      init_bins, initVal);
+  RAJA::MultiReduceMin<MultiReducePolicy, NumericType> multi_reduce_min(
+      init_bins, initVal);
+  RAJA::MultiReduceMax<MultiReducePolicy, NumericType> multi_reduce_max(
+      init_bins, initVal);
+
+  if (use_reducer)
+  {
+    forone<ForOnePol>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          for (size_t bin = 0; bin < init_bins; ++bin)
+          {
+            multi_reduce_sum[bin] += initVal;
+            multi_reduce_min[bin].min(initVal - 1);
+            multi_reduce_max[bin].max(initVal + 1);
+          }
+        });
   }
 
   multi_reduce_sum.reset(container);
@@ -307,7 +363,8 @@ void testMultiReducerContainerResetRegular(bool use_reducer, size_t init_bins, C
   ASSERT_EQ(multi_reduce_max.size(), num_bins);
 
   size_t bin = 0;
-  for (NumericType val : container) {
+  for (NumericType val : container)
+  {
     ASSERT_EQ(multi_reduce_sum.get(bin), val);
     ASSERT_EQ(multi_reduce_min.get(bin), val);
     ASSERT_EQ(multi_reduce_max.get(bin), val);
@@ -319,25 +376,33 @@ void testMultiReducerContainerResetRegular(bool use_reducer, size_t init_bins, C
   }
 }
 
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            typename Container >
-void testMultiReducerContainerResetBitwise(bool use_reducer, size_t init_bins, Container const& container)
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          typename Container>
+void testMultiReducerContainerResetBitwise(bool use_reducer,
+                                           size_t init_bins,
+                                           Container const& container)
 {
   const size_t num_bins = container.size();
-  NumericType initVal = NumericType(5);
-
-  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(init_bins, initVal);
-  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(init_bins, initVal);
-
-  if (use_reducer) {
-    forone<ForOnePol>( [=] RAJA_HOST_DEVICE() {
-      for (size_t bin = 0; bin < init_bins; ++bin) {
-        multi_reduce_and[bin] &= initVal-1;
-        multi_reduce_or[bin] |= initVal+1;
-      }
-    });
+  NumericType initVal   = NumericType(5);
+
+  RAJA::MultiReduceBitAnd<MultiReducePolicy, NumericType> multi_reduce_and(
+      init_bins, initVal);
+  RAJA::MultiReduceBitOr<MultiReducePolicy, NumericType> multi_reduce_or(
+      init_bins, initVal);
+
+  if (use_reducer)
+  {
+    forone<ForOnePol>(
+        [=] RAJA_HOST_DEVICE()
+        {
+          for (size_t bin = 0; bin < init_bins; ++bin)
+          {
+            multi_reduce_and[bin] &= initVal - 1;
+            multi_reduce_or[bin] |= initVal + 1;
+          }
+        });
   }
 
   multi_reduce_and.reset(container);
@@ -347,7 +412,8 @@ void testMultiReducerContainerResetBitwise(bool use_reducer, size_t init_bins, C
   ASSERT_EQ(multi_reduce_or.size(), num_bins);
 
   size_t bin = 0;
-  for (NumericType val : container) {
+  for (NumericType val : container)
+  {
     ASSERT_EQ(multi_reduce_and.get(bin), val);
     ASSERT_EQ(multi_reduce_or.get(bin), val);
 
@@ -357,50 +423,61 @@ void testMultiReducerContainerResetBitwise(bool use_reducer, size_t init_bins, C
   }
 }
 
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            typename Container,
-            std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr >
-void testMultiReducerContainerResetSize(size_t init_bins, Container const& container)
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          typename Container,
+          std::enable_if_t<std::is_integral<NumericType>::value>* = nullptr>
+void testMultiReducerContainerResetSize(size_t init_bins,
+                                        Container const& container)
 {
-  testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container);
-  testMultiReducerContainerResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container);
+  testMultiReducerContainerResetRegular<MultiReducePolicy, NumericType,
+                                        ForOnePol>(false, init_bins, container);
+  testMultiReducerContainerResetBitwise<MultiReducePolicy, NumericType,
+                                        ForOnePol>(false, init_bins, container);
   // avoid using the reducer as forone does not handle reducers correctly
   // forone does not make_lambda_body or privatize the body
-  // testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container);
-  // testMultiReducerContainerResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container);
+  // testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType,
+  // ForOnePol >(true, init_bins, container);
+  // testMultiReducerContainerResetBitwise< MultiReducePolicy, NumericType,
+  // ForOnePol >(true, init_bins, container);
 }
 ///
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            typename Container,
-            std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr >
-void testMultiReducerContainerResetSize(size_t init_bins, Container const& container)
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          typename Container,
+          std::enable_if_t<!std::is_integral<NumericType>::value>* = nullptr>
+void testMultiReducerContainerResetSize(size_t init_bins,
+                                        Container const& container)
 {
-  testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container);
+  testMultiReducerContainerResetRegular<MultiReducePolicy, NumericType,
+                                        ForOnePol>(false, init_bins, container);
   // avoid using the reducer as forone does not handle reducers correctly
   // forone does not make_lambda_body or privatize the body
-  // testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container);
+  // testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType,
+  // ForOnePol >(true, init_bins, container);
 }
 
-template <  typename MultiReducePolicy,
-            typename NumericType,
-            typename ForOnePol,
-            typename Container >
+template <typename MultiReducePolicy,
+          typename NumericType,
+          typename ForOnePol,
+          typename Container>
 void testMultiReducerContainerReset(Container const& container)
 {
-  testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(0, container);
-  testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(4, container);
-  testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(container.size(), container);
+  testMultiReducerContainerResetSize<MultiReducePolicy, NumericType, ForOnePol>(
+      0, container);
+  testMultiReducerContainerResetSize<MultiReducePolicy, NumericType, ForOnePol>(
+      4, container);
+  testMultiReducerContainerResetSize<MultiReducePolicy, NumericType, ForOnePol>(
+      container.size(), container);
 }
 
 TYPED_TEST_P(MultiReducerContainerResetUnitTest, MultiReducerReset)
 {
   using MultiReducePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
-  using ForOnePol = typename camp::at<TypeParam, camp::num<2>>::type;
+  using NumericType       = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ForOnePol         = typename camp::at<TypeParam, camp::num<2>>::type;
 
   std::vector<NumericType> c0(0);
   std::vector<NumericType> c1(1, 3);
@@ -408,22 +485,21 @@ TYPED_TEST_P(MultiReducerContainerResetUnitTest, MultiReducerReset)
   c2.emplace(5);
   c2.emplace(8);
   std::list<NumericType> c10;
-  for (size_t bin = 0; bin < size_t(10); ++bin) {
+  for (size_t bin = 0; bin < size_t(10); ++bin)
+  {
     c10.emplace_front(NumericType(bin));
   }
-  testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c0);
-  testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c1);
-  testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c2);
-  testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c10);
+  testMultiReducerContainerReset<MultiReducePolicy, NumericType, ForOnePol>(c0);
+  testMultiReducerContainerReset<MultiReducePolicy, NumericType, ForOnePol>(c1);
+  testMultiReducerContainerReset<MultiReducePolicy, NumericType, ForOnePol>(c2);
+  testMultiReducerContainerReset<MultiReducePolicy, NumericType, ForOnePol>(
+      c10);
 }
 
 
+REGISTER_TYPED_TEST_SUITE_P(MultiReducerBasicResetUnitTest, MultiReducerReset);
 
-REGISTER_TYPED_TEST_SUITE_P(MultiReducerBasicResetUnitTest,
-                            MultiReducerReset);
-
-REGISTER_TYPED_TEST_SUITE_P(MultiReducerSingleResetUnitTest,
-                            MultiReducerReset);
+REGISTER_TYPED_TEST_SUITE_P(MultiReducerSingleResetUnitTest, MultiReducerReset);
 
 REGISTER_TYPED_TEST_SUITE_P(MultiReducerContainerResetUnitTest,
                             MultiReducerReset);
diff --git a/test/unit/reducer/test-reducer-constructors-cuda.cpp b/test/unit/reducer/test-reducer-constructors-cuda.cpp
index fea3bb9b90..75889c4706 100644
--- a/test/unit/reducer/test-reducer-constructors-cuda.cpp
+++ b/test/unit/reducer/test-reducer-constructors-cuda.cpp
@@ -6,22 +6,23 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing tests for RAJA reducer constructors and initialization.
+/// Source file containing tests for RAJA reducer constructors and
+/// initialization.
 ///
 
 #include "tests/test-reducer-constructors.hpp"
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaBasicReducerConstructorTypes = 
-  Test< camp::cartesian_product< CudaReducerPolicyList,
+using CudaBasicReducerConstructorTypes =
+    Test<camp::cartesian_product<CudaReducerPolicyList,
                                  DataTypeList,
-                                 CudaResourceList > >::Types;
+                                 CudaResourceList>>::Types;
 
-using CudaInitReducerConstructorTypes = 
-  Test< camp::cartesian_product< CudaReducerPolicyList,
+using CudaInitReducerConstructorTypes =
+    Test<camp::cartesian_product<CudaReducerPolicyList,
                                  DataTypeList,
                                  CudaResourceList,
-                                 CudaUnitTestPolicyList > >::Types;
+                                 CudaUnitTestPolicyList>>::Types;
 
 INSTANTIATE_TYPED_TEST_SUITE_P(CudaBasicTest,
                                ReducerBasicConstructorUnitTest,
@@ -31,4 +32,3 @@ INSTANTIATE_TYPED_TEST_SUITE_P(CudaInitTest,
                                ReducerInitConstructorUnitTest,
                                CudaInitReducerConstructorTypes);
 #endif
-
diff --git a/test/unit/reducer/test-reducer-constructors-hip.cpp b/test/unit/reducer/test-reducer-constructors-hip.cpp
index 0b3197b2ef..c4f4ddb8b4 100644
--- a/test/unit/reducer/test-reducer-constructors-hip.cpp
+++ b/test/unit/reducer/test-reducer-constructors-hip.cpp
@@ -6,22 +6,23 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing tests for RAJA reducer constructors and initialization.
+/// Source file containing tests for RAJA reducer constructors and
+/// initialization.
 ///
 
 #include "tests/test-reducer-constructors.hpp"
 
 #if defined(RAJA_ENABLE_HIP)
-using HipBasicReducerConstructorTypes = 
-  Test< camp::cartesian_product< HipReducerPolicyList,
+using HipBasicReducerConstructorTypes =
+    Test<camp::cartesian_product<HipReducerPolicyList,
                                  DataTypeList,
-                                 HipResourceList > >::Types;
+                                 HipResourceList>>::Types;
 
-using HipInitReducerConstructorTypes = 
-  Test< camp::cartesian_product< HipReducerPolicyList,
+using HipInitReducerConstructorTypes =
+    Test<camp::cartesian_product<HipReducerPolicyList,
                                  DataTypeList,
                                  HipResourceList,
-                                 HipUnitTestPolicyList > >::Types;
+                                 HipUnitTestPolicyList>>::Types;
 
 INSTANTIATE_TYPED_TEST_SUITE_P(HipBasicTest,
                                ReducerBasicConstructorUnitTest,
@@ -31,4 +32,3 @@ INSTANTIATE_TYPED_TEST_SUITE_P(HipInitTest,
                                ReducerInitConstructorUnitTest,
                                HipInitReducerConstructorTypes);
 #endif
-
diff --git a/test/unit/reducer/test-reducer-constructors-openmp-target.cpp b/test/unit/reducer/test-reducer-constructors-openmp-target.cpp
index b3204c7827..3dd9e8ae39 100644
--- a/test/unit/reducer/test-reducer-constructors-openmp-target.cpp
+++ b/test/unit/reducer/test-reducer-constructors-openmp-target.cpp
@@ -6,20 +6,20 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing tests for RAJA reducer constructors and initialization.
+/// Source file containing tests for RAJA reducer constructors and
+/// initialization.
 ///
 
 #include "tests/test-reducer-constructors.hpp"
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetInitReducerConstructorTypes = 
-  Test< camp::cartesian_product< OpenMPTargetReducerPolicyList,
+using OpenMPTargetInitReducerConstructorTypes =
+    Test<camp::cartesian_product<OpenMPTargetReducerPolicyList,
                                  DataTypeList,
                                  OpenMPTargetResourceList,
-                                 SequentialUnitTestPolicyList > >::Types;
+                                 SequentialUnitTestPolicyList>>::Types;
 
 INSTANTIATE_TYPED_TEST_SUITE_P(OpenMPTargetInitTest,
                                ReducerInitConstructorUnitTest,
                                OpenMPTargetInitReducerConstructorTypes);
 #endif
-
diff --git a/test/unit/reducer/test-reducer-constructors-openmp.cpp b/test/unit/reducer/test-reducer-constructors-openmp.cpp
index 26d39cdd5f..eb31791058 100644
--- a/test/unit/reducer/test-reducer-constructors-openmp.cpp
+++ b/test/unit/reducer/test-reducer-constructors-openmp.cpp
@@ -6,22 +6,23 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing tests for RAJA reducer constructors and initialization.
+/// Source file containing tests for RAJA reducer constructors and
+/// initialization.
 ///
 
 #include "tests/test-reducer-constructors.hpp"
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPBasicReducerConstructorTypes = 
-  Test< camp::cartesian_product< OpenMPReducerPolicyList,
+using OpenMPBasicReducerConstructorTypes =
+    Test<camp::cartesian_product<OpenMPReducerPolicyList,
                                  DataTypeList,
-                                 HostResourceList > >::Types;
+                                 HostResourceList>>::Types;
 
-using OpenMPInitReducerConstructorTypes = 
-  Test< camp::cartesian_product< OpenMPReducerPolicyList,
+using OpenMPInitReducerConstructorTypes =
+    Test<camp::cartesian_product<OpenMPReducerPolicyList,
                                  DataTypeList,
                                  HostResourceList,
-                                 SequentialUnitTestPolicyList > >::Types;
+                                 SequentialUnitTestPolicyList>>::Types;
 
 INSTANTIATE_TYPED_TEST_SUITE_P(OpenMPBasicTest,
                                ReducerBasicConstructorUnitTest,
@@ -31,4 +32,3 @@ INSTANTIATE_TYPED_TEST_SUITE_P(OpenMPInitTest,
                                ReducerInitConstructorUnitTest,
                                OpenMPInitReducerConstructorTypes);
 #endif
-
diff --git a/test/unit/reducer/test-reducer-constructors-seq.cpp b/test/unit/reducer/test-reducer-constructors-seq.cpp
index 134766eb9a..7d765529f8 100644
--- a/test/unit/reducer/test-reducer-constructors-seq.cpp
+++ b/test/unit/reducer/test-reducer-constructors-seq.cpp
@@ -6,21 +6,22 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing tests for RAJA reducer constructors and initialization.
+/// Source file containing tests for RAJA reducer constructors and
+/// initialization.
 ///
 
 #include "tests/test-reducer-constructors.hpp"
 
-using SequentialBasicReducerConstructorTypes = 
-  Test< camp::cartesian_product< SequentialReducerPolicyList,
+using SequentialBasicReducerConstructorTypes =
+    Test<camp::cartesian_product<SequentialReducerPolicyList,
                                  DataTypeList,
-                                 HostResourceList > >::Types;
+                                 HostResourceList>>::Types;
 
-using SequentialInitReducerConstructorTypes = 
-  Test< camp::cartesian_product< SequentialReducerPolicyList,
+using SequentialInitReducerConstructorTypes =
+    Test<camp::cartesian_product<SequentialReducerPolicyList,
                                  DataTypeList,
                                  HostResourceList,
-                                 SequentialUnitTestPolicyList > >::Types;
+                                 SequentialUnitTestPolicyList>>::Types;
 
 INSTANTIATE_TYPED_TEST_SUITE_P(SequentialBasicTest,
                                ReducerBasicConstructorUnitTest,
@@ -29,5 +30,3 @@ INSTANTIATE_TYPED_TEST_SUITE_P(SequentialBasicTest,
 INSTANTIATE_TYPED_TEST_SUITE_P(SequentialInitTest,
                                ReducerInitConstructorUnitTest,
                                SequentialInitReducerConstructorTypes);
-
-
diff --git a/test/unit/reducer/test-reducer-reset-cuda.cpp b/test/unit/reducer/test-reducer-reset-cuda.cpp
index 06944d488d..2443419c7d 100644
--- a/test/unit/reducer/test-reducer-reset-cuda.cpp
+++ b/test/unit/reducer/test-reducer-reset-cuda.cpp
@@ -12,11 +12,11 @@
 #include "tests/test-reducer-reset.hpp"
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaReducerResetTypes = 
-  Test< camp::cartesian_product< CudaReducerPolicyList,
+using CudaReducerResetTypes =
+    Test<camp::cartesian_product<CudaReducerPolicyList,
                                  DataTypeList,
                                  CudaResourceList,
-                                 CudaUnitTestPolicyList > >::Types;
+                                 CudaUnitTestPolicyList>>::Types;
 
 
 INSTANTIATE_TYPED_TEST_SUITE_P(CudaResetTest,
diff --git a/test/unit/reducer/test-reducer-reset-hip.cpp b/test/unit/reducer/test-reducer-reset-hip.cpp
index cfca5e3787..eb31480311 100644
--- a/test/unit/reducer/test-reducer-reset-hip.cpp
+++ b/test/unit/reducer/test-reducer-reset-hip.cpp
@@ -12,11 +12,11 @@
 #include "tests/test-reducer-reset.hpp"
 
 #if defined(RAJA_ENABLE_HIP)
-using HipReducerResetTypes = 
-  Test< camp::cartesian_product< HipReducerPolicyList,
+using HipReducerResetTypes =
+    Test<camp::cartesian_product<HipReducerPolicyList,
                                  DataTypeList,
                                  HipResourceList,
-                                 HipUnitTestPolicyList > >::Types;
+                                 HipUnitTestPolicyList>>::Types;
 
 
 INSTANTIATE_TYPED_TEST_SUITE_P(HipResetTest,
diff --git a/test/unit/reducer/test-reducer-reset-openmp-target.cpp b/test/unit/reducer/test-reducer-reset-openmp-target.cpp
index 1bf7777bf1..5f02ec92ea 100644
--- a/test/unit/reducer/test-reducer-reset-openmp-target.cpp
+++ b/test/unit/reducer/test-reducer-reset-openmp-target.cpp
@@ -12,11 +12,11 @@
 #include "tests/test-reducer-reset.hpp"
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetReducerResetTypes = 
-  Test< camp::cartesian_product< OpenMPTargetReducerPolicyList,
+using OpenMPTargetReducerResetTypes =
+    Test<camp::cartesian_product<OpenMPTargetReducerPolicyList,
                                  DataTypeList,
                                  OpenMPTargetResourceList,
-                                 SequentialUnitTestPolicyList > >::Types;
+                                 SequentialUnitTestPolicyList>>::Types;
 
 
 INSTANTIATE_TYPED_TEST_SUITE_P(OpenMPTargetResetTest,
diff --git a/test/unit/reducer/test-reducer-reset-openmp.cpp b/test/unit/reducer/test-reducer-reset-openmp.cpp
index 3f8d54287f..a570a7be6a 100644
--- a/test/unit/reducer/test-reducer-reset-openmp.cpp
+++ b/test/unit/reducer/test-reducer-reset-openmp.cpp
@@ -12,11 +12,11 @@
 #include "tests/test-reducer-reset.hpp"
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPReducerResetTypes = 
-  Test< camp::cartesian_product< OpenMPReducerPolicyList,
+using OpenMPReducerResetTypes =
+    Test<camp::cartesian_product<OpenMPReducerPolicyList,
                                  DataTypeList,
                                  HostResourceList,
-                                 SequentialUnitTestPolicyList > >::Types;
+                                 SequentialUnitTestPolicyList>>::Types;
 
 
 INSTANTIATE_TYPED_TEST_SUITE_P(OpenMPResetTest,
diff --git a/test/unit/reducer/test-reducer-reset-seq.cpp b/test/unit/reducer/test-reducer-reset-seq.cpp
index 2b1ff4a748..5884aa43e4 100644
--- a/test/unit/reducer/test-reducer-reset-seq.cpp
+++ b/test/unit/reducer/test-reducer-reset-seq.cpp
@@ -11,14 +11,13 @@
 
 #include "tests/test-reducer-reset.hpp"
 
-using SequentialReducerResetTypes = 
-  Test< camp::cartesian_product< SequentialReducerPolicyList,
+using SequentialReducerResetTypes =
+    Test<camp::cartesian_product<SequentialReducerPolicyList,
                                  DataTypeList,
                                  HostResourceList,
-                                 SequentialUnitTestPolicyList > >::Types;
+                                 SequentialUnitTestPolicyList>>::Types;
 
 
 INSTANTIATE_TYPED_TEST_SUITE_P(SequentialResetTest,
                                ReducerResetUnitTest,
                                SequentialReducerResetTypes);
-
diff --git a/test/unit/reducer/test-reducer.hpp b/test/unit/reducer/test-reducer.hpp
index aa8fbda9cf..55fa58f6ee 100644
--- a/test/unit/reducer/test-reducer.hpp
+++ b/test/unit/reducer/test-reducer.hpp
@@ -16,27 +16,25 @@
 //
 // Data types
 //
-using DataTypeList = camp::list< int,
-                                 float,
-                                 double >;
+using DataTypeList = camp::list<int, float, double>;
 
-using SequentialReducerPolicyList = camp::list< RAJA::seq_reduce >;
+using SequentialReducerPolicyList = camp::list<RAJA::seq_reduce>;
 
 #if defined(RAJA_ENABLE_OPENMP)
-using OpenMPReducerPolicyList = camp::list< RAJA::omp_reduce,
-                                            RAJA::omp_reduce_ordered >;
+using OpenMPReducerPolicyList =
+    camp::list<RAJA::omp_reduce, RAJA::omp_reduce_ordered>;
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-using OpenMPTargetReducerPolicyList = camp::list< RAJA::omp_target_reduce >;
+using OpenMPTargetReducerPolicyList = camp::list<RAJA::omp_target_reduce>;
 #endif
 
 #if defined(RAJA_ENABLE_CUDA)
-using CudaReducerPolicyList = camp::list< RAJA::cuda_reduce >;
+using CudaReducerPolicyList = camp::list<RAJA::cuda_reduce>;
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
-using HipReducerPolicyList = camp::list< RAJA::hip_reduce >;
+using HipReducerPolicyList = camp::list<RAJA::hip_reduce>;
 #endif
 
 #endif  // __TEST_REDUCER_UTILS_HPP__
diff --git a/test/unit/reducer/tests/test-reducer-constructors.hpp b/test/unit/reducer/tests/test-reducer-constructors.hpp
index d02d42fce9..d771a77b77 100644
--- a/test/unit/reducer/tests/test-reducer-constructors.hpp
+++ b/test/unit/reducer/tests/test-reducer-constructors.hpp
@@ -6,7 +6,8 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Header file containing tests for RAJA reducer constructors and initialization.
+/// Header file containing tests for RAJA reducer constructors and
+/// initialization.
 ///
 
 #ifndef __TEST_REDUCER_CONSTRUCTOR__
@@ -18,29 +19,26 @@
 
 template <typename T>
 class ReducerBasicConstructorUnitTest : public ::testing::Test
-{
-};
+{};
 
 template <typename T>
 class ReducerInitConstructorUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(ReducerBasicConstructorUnitTest);
 TYPED_TEST_SUITE_P(ReducerInitConstructorUnitTest);
 
 #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
-template <typename ReducePolicy,
-          typename NumericType>
-typename  std::enable_if<
-#if defined(RAJA_ENABLE_CUDA) // CUDA policy does nothing.
-            std::is_same<ReducePolicy, RAJA::cuda_reduce>::value
-#elif defined(RAJA_ENABLE_HIP) // HIP policy does nothing.
-            std::is_same<ReducePolicy, RAJA::hip_reduce>::value
+template <typename ReducePolicy, typename NumericType>
+typename std::enable_if<
+#if defined(RAJA_ENABLE_CUDA)  // CUDA policy does nothing.
+    std::is_same<ReducePolicy, RAJA::cuda_reduce>::value
+#elif defined(RAJA_ENABLE_HIP)  // HIP policy does nothing.
+    std::is_same<ReducePolicy, RAJA::hip_reduce>::value
 #else
 #error Please enable a supported GPU platform, e.g. CUDA or HIP.
 #endif
-          >::type
+    >::type
 testReducerConstructor()
 {
   // do nothing
@@ -51,15 +49,15 @@ testReducerConstructor()
 // Should not run this on a GPU.
 template <typename ReducePolicy,
           typename NumericType>
-typename  std::enable_if< // CPU policy.
+typename std::enable_if<  // CPU policy.
 #if defined(RAJA_ENABLE_CUDA)
-            !std::is_same<ReducePolicy, RAJA::cuda_reduce>::value
+    !std::is_same<ReducePolicy, RAJA::cuda_reduce>::value
 #elif defined(RAJA_ENABLE_HIP)
-            !std::is_same<ReducePolicy, RAJA::hip_reduce>::value
+    !std::is_same<ReducePolicy, RAJA::hip_reduce>::value
 #else
-            true  // Always run for non-GPU policies.
+    true  // Always run for non-GPU policies.
 #endif
-          >::type
+    >::type
 testReducerConstructor()
 {
   RAJA::ReduceSum<ReducePolicy, NumericType> reduce_sum;
@@ -68,8 +66,12 @@ testReducerConstructor()
   RAJA::ReduceMinLoc<ReducePolicy, NumericType> reduce_minloc;
   RAJA::ReduceMaxLoc<ReducePolicy, NumericType> reduce_maxloc;
 
-  RAJA::ReduceMinLoc<ReducePolicy, NumericType, RAJA::tuple<RAJA::Index_type, RAJA::Index_type>> reduce_minloctup;
-  RAJA::ReduceMaxLoc<ReducePolicy, NumericType, RAJA::tuple<RAJA::Index_type, RAJA::Index_type>> reduce_maxloctup;
+  RAJA::ReduceMinLoc<ReducePolicy, NumericType,
+                     RAJA::tuple<RAJA::Index_type, RAJA::Index_type>>
+      reduce_minloctup;
+  RAJA::ReduceMaxLoc<ReducePolicy, NumericType,
+                     RAJA::tuple<RAJA::Index_type, RAJA::Index_type>>
+      reduce_maxloctup;
 
   ASSERT_EQ((NumericType)reduce_sum.get(), NumericType());
   ASSERT_EQ((NumericType)reduce_min.get(), NumericType());
@@ -82,44 +84,48 @@ testReducerConstructor()
 
   ASSERT_EQ((NumericType)reduce_minloctup.get(), NumericType());
   ASSERT_EQ((NumericType)reduce_maxloctup.get(), NumericType());
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_minloctup.getLoc())), RAJA::Index_type());
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_minloctup.getLoc())), RAJA::Index_type());
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_maxloctup.getLoc())), RAJA::Index_type());
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_maxloctup.getLoc())), RAJA::Index_type());
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_minloctup.getLoc())),
+            RAJA::Index_type());
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_minloctup.getLoc())),
+            RAJA::Index_type());
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_maxloctup.getLoc())),
+            RAJA::Index_type());
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_maxloctup.getLoc())),
+            RAJA::Index_type());
 }
 
 TYPED_TEST_P(ReducerBasicConstructorUnitTest, BasicReducerConstructor)
 {
   using ReducePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using NumericType  = typename camp::at<TypeParam, camp::num<1>>::type;
 
-  testReducerConstructor< ReducePolicy, NumericType >();
+  testReducerConstructor<ReducePolicy, NumericType>();
 }
 
-template  < typename ReducePolicy,
-            typename NumericType,
-            typename ForOnePol >
-typename  std::enable_if< // Host policy does nothing.
-            std::is_base_of<RunOnHost, ForOnePol>::value
-          >::type
-exec_dispatcher( NumericType * RAJA_UNUSED_ARG(initVal) )
+template <typename ReducePolicy,
+          typename NumericType,
+          typename ForOnePol>
+typename std::enable_if<  // Host policy does nothing.
+    std::is_base_of<RunOnHost, ForOnePol>::value>::type
+exec_dispatcher(NumericType* RAJA_UNUSED_ARG(initVal))
 {
   // Do nothing for host policies.
 }
 
 #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
-template  < typename ReducePolicy,
-            typename NumericType,
-            typename ForOnePol >
-typename  std::enable_if< // GPU policy fiddles with value.
-            std::is_base_of<RunOnDevice, ForOnePol>::value
-          >::type
-exec_dispatcher( NumericType * initVal )
+template <typename ReducePolicy,
+          typename NumericType,
+          typename ForOnePol>
+typename std::enable_if<  // GPU policy fiddles with value.
+    std::is_base_of<RunOnDevice, ForOnePol>::value>::type
+exec_dispatcher(NumericType* initVal)
 {
-  forone<ForOnePol>( [=] __device__ () {
-                        initVal[0] += 1;
-                        initVal[0] -= 1;
-                 });
+  forone<ForOnePol>(
+      [=] __device__()
+      {
+        initVal[0] += 1;
+        initVal[0] -= 1;
+      });
 }
 #endif
 
@@ -129,27 +135,27 @@ template <typename ReducePolicy,
           typename ForOnePol>
 void testInitReducerConstructor()
 {
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  NumericType * theVal = nullptr;
-  NumericType * workVal = nullptr;
+  NumericType* theVal  = nullptr;
+  NumericType* workVal = nullptr;
 
   NumericType initVal = (NumericType)5;
 
   workVal = work_res.allocate<NumericType>(1);
-  theVal = host_res.allocate<NumericType>(1);
+  theVal  = host_res.allocate<NumericType>(1);
 
-  work_res.memcpy( workVal, &initVal, sizeof(initVal) );
+  work_res.memcpy(workVal, &initVal, sizeof(initVal));
   theVal[0] = (NumericType)10;
 
-  #if defined(RAJA_ENABLE_CUDA)
+#if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
-  #endif
+#endif
 
-  #if defined(RAJA_ENABLE_HIP)
+#if defined(RAJA_ENABLE_HIP)
   hipErrchk(hipDeviceSynchronize());
-  #endif
+#endif
 
   RAJA::ReduceSum<ReducePolicy, NumericType> reduce_sum(initVal);
   RAJA::ReduceMin<ReducePolicy, NumericType> reduce_min(initVal);
@@ -158,17 +164,17 @@ void testInitReducerConstructor()
   RAJA::ReduceMaxLoc<ReducePolicy, NumericType> reduce_maxloc(initVal, 1);
 
   RAJA::tuple<RAJA::Index_type, RAJA::Index_type> LocTup(1, 1);
-  RAJA::ReduceMinLoc<ReducePolicy, NumericType, RAJA::tuple<RAJA::Index_type, RAJA::Index_type>> reduce_minloctup(initVal, LocTup);
-  RAJA::ReduceMaxLoc<ReducePolicy, NumericType, RAJA::tuple<RAJA::Index_type, RAJA::Index_type>> reduce_maxloctup(initVal, LocTup);
+  RAJA::ReduceMinLoc<ReducePolicy, NumericType,
+                     RAJA::tuple<RAJA::Index_type, RAJA::Index_type>>
+      reduce_minloctup(initVal, LocTup);
+  RAJA::ReduceMaxLoc<ReducePolicy, NumericType,
+                     RAJA::tuple<RAJA::Index_type, RAJA::Index_type>>
+      reduce_maxloctup(initVal, LocTup);
 
   // move a value onto device and fiddle
-  exec_dispatcher < ReducePolicy,
-                    NumericType,
-                    ForOnePol
-                  >
-                  ( workVal );
+  exec_dispatcher<ReducePolicy, NumericType, ForOnePol>(workVal);
 
-  work_res.memcpy( &initVal, workVal, sizeof(initVal) );
+  work_res.memcpy(&initVal, workVal, sizeof(initVal));
 
   theVal[0] = initVal;
 
@@ -185,23 +191,28 @@ void testInitReducerConstructor()
 
   ASSERT_EQ((NumericType)reduce_minloctup.get(), (NumericType)(initVal));
   ASSERT_EQ((NumericType)reduce_maxloctup.get(), (NumericType)(initVal));
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_minloctup.getLoc())), (RAJA::Index_type)1);
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_minloctup.getLoc())), (RAJA::Index_type)1);
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_maxloctup.getLoc())), (RAJA::Index_type)1);
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_maxloctup.getLoc())), (RAJA::Index_type)1);
-
-  work_res.deallocate( workVal );
-  host_res.deallocate( theVal );
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_minloctup.getLoc())),
+            (RAJA::Index_type)1);
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_minloctup.getLoc())),
+            (RAJA::Index_type)1);
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_maxloctup.getLoc())),
+            (RAJA::Index_type)1);
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_maxloctup.getLoc())),
+            (RAJA::Index_type)1);
+
+  work_res.deallocate(workVal);
+  host_res.deallocate(theVal);
 }
 
 TYPED_TEST_P(ReducerInitConstructorUnitTest, InitReducerConstructor)
 {
-  using ReduceType = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ReduceType   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using NumericType  = typename camp::at<TypeParam, camp::num<1>>::type;
   using ResourceType = typename camp::at<TypeParam, camp::num<2>>::type;
-  using ForOneType = typename camp::at<TypeParam, camp::num<3>>::type;
+  using ForOneType   = typename camp::at<TypeParam, camp::num<3>>::type;
 
-  testInitReducerConstructor< ReduceType, NumericType, ResourceType, ForOneType >();
+  testInitReducerConstructor<ReduceType, NumericType, ResourceType,
+                             ForOneType>();
 }
 
 
diff --git a/test/unit/reducer/tests/test-reducer-reset.hpp b/test/unit/reducer/tests/test-reducer-reset.hpp
index b82ae2995f..c976545fc8 100644
--- a/test/unit/reducer/tests/test-reducer-reset.hpp
+++ b/test/unit/reducer/tests/test-reducer-reset.hpp
@@ -16,97 +16,98 @@
 
 #include "../test-reducer.hpp"
 
-template  < typename ReducePolicy,
-            typename NumericType,
-            typename Indexer,
-            typename Tuple,
-            typename ForOnePol
-          >
-typename  std::enable_if< // Empty function for non-device policy.
-            std::is_base_of<RunOnHost, ForOnePol>::value
-          >::type
-exec_dispatcher(  RAJA::ReduceSum<ReducePolicy, NumericType> & RAJA_UNUSED_ARG(reduce_sum),
-                  RAJA::ReduceMin<ReducePolicy, NumericType> & RAJA_UNUSED_ARG(reduce_min),
-                  RAJA::ReduceMax<ReducePolicy, NumericType> & RAJA_UNUSED_ARG(reduce_max),
-                  RAJA::ReduceMinLoc<ReducePolicy, NumericType, Indexer> & RAJA_UNUSED_ARG(reduce_minloc),
-                  RAJA::ReduceMaxLoc<ReducePolicy, NumericType, Indexer> & RAJA_UNUSED_ARG(reduce_maxloc),
-                  RAJA::ReduceMinLoc<ReducePolicy, NumericType, Tuple> & RAJA_UNUSED_ARG(reduce_minloctup),
-                  RAJA::ReduceMaxLoc<ReducePolicy, NumericType, Tuple> & RAJA_UNUSED_ARG(reduce_maxloctup),
-                  NumericType RAJA_UNUSED_ARG(initVal)
-               )
+template <typename ReducePolicy,
+          typename NumericType,
+          typename Indexer,
+          typename Tuple,
+          typename ForOnePol>
+typename std::enable_if<  // Empty function for non-device policy.
+    std::is_base_of<RunOnHost, ForOnePol>::value>::type
+exec_dispatcher(
+    RAJA::ReduceSum<ReducePolicy, NumericType>& RAJA_UNUSED_ARG(reduce_sum),
+    RAJA::ReduceMin<ReducePolicy, NumericType>& RAJA_UNUSED_ARG(reduce_min),
+    RAJA::ReduceMax<ReducePolicy, NumericType>& RAJA_UNUSED_ARG(reduce_max),
+    RAJA::ReduceMinLoc<ReducePolicy, NumericType, Indexer>& RAJA_UNUSED_ARG(
+        reduce_minloc),
+    RAJA::ReduceMaxLoc<ReducePolicy, NumericType, Indexer>& RAJA_UNUSED_ARG(
+        reduce_maxloc),
+    RAJA::ReduceMinLoc<ReducePolicy, NumericType, Tuple>& RAJA_UNUSED_ARG(
+        reduce_minloctup),
+    RAJA::ReduceMaxLoc<ReducePolicy, NumericType, Tuple>& RAJA_UNUSED_ARG(
+        reduce_maxloctup),
+    NumericType RAJA_UNUSED_ARG(initVal))
 {
   // Non-device policies should do nothing.
 }
 
 #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
-template  < typename ReducePolicy,
-            typename NumericType,
-            typename Indexer,
-            typename Tuple,
-            typename ForOnePol
-          >
-typename  std::enable_if< // GPU policy execution.
-            std::is_base_of<RunOnDevice, ForOnePol>::value
-          >::type
-exec_dispatcher(  RAJA::ReduceSum<ReducePolicy, NumericType> & reduce_sum,
-                  RAJA::ReduceMin<ReducePolicy, NumericType> & reduce_min,
-                  RAJA::ReduceMax<ReducePolicy, NumericType> & reduce_max,
-                  RAJA::ReduceMinLoc<ReducePolicy, NumericType, Indexer> & reduce_minloc,
-                  RAJA::ReduceMaxLoc<ReducePolicy, NumericType, Indexer> & reduce_maxloc,
-                  RAJA::ReduceMinLoc<ReducePolicy, NumericType, Tuple> & reduce_minloctup,
-                  RAJA::ReduceMaxLoc<ReducePolicy, NumericType, Tuple> & reduce_maxloctup,
-                  NumericType initVal
-               )
+template <typename ReducePolicy,
+          typename NumericType,
+          typename Indexer,
+          typename Tuple,
+          typename ForOnePol>
+typename std::enable_if<  // GPU policy execution.
+    std::is_base_of<RunOnDevice, ForOnePol>::value>::type
+exec_dispatcher(
+    RAJA::ReduceSum<ReducePolicy, NumericType>& reduce_sum,
+    RAJA::ReduceMin<ReducePolicy, NumericType>& reduce_min,
+    RAJA::ReduceMax<ReducePolicy, NumericType>& reduce_max,
+    RAJA::ReduceMinLoc<ReducePolicy, NumericType, Indexer>& reduce_minloc,
+    RAJA::ReduceMaxLoc<ReducePolicy, NumericType, Indexer>& reduce_maxloc,
+    RAJA::ReduceMinLoc<ReducePolicy, NumericType, Tuple>& reduce_minloctup,
+    RAJA::ReduceMaxLoc<ReducePolicy, NumericType, Tuple>& reduce_maxloctup,
+    NumericType initVal)
 {
   // Use device to activate any value for each reducer.
-  forone<ForOnePol>( [=] __host__ __device__ () {
-                    Tuple temploc(0,0);
-                    reduce_sum += initVal;
-                    reduce_min.min(0);
-                    reduce_max.max(0);
-                    reduce_minloc.minloc(0,0);
-                    reduce_maxloc.maxloc(0,0);
-                    reduce_minloctup.minloc(0,temploc);
-                    reduce_maxloctup.maxloc(0,temploc);
-                 });
+  forone<ForOnePol>(
+      [=] __host__ __device__()
+      {
+        Tuple temploc(0, 0);
+        reduce_sum += initVal;
+        reduce_min.min(0);
+        reduce_max.max(0);
+        reduce_minloc.minloc(0, 0);
+        reduce_maxloc.maxloc(0, 0);
+        reduce_minloctup.minloc(0, temploc);
+        reduce_maxloctup.maxloc(0, temploc);
+      });
   // Relying on implicit device synchronization in forone.
 }
 #endif
 
 template <typename T>
 class ReducerResetUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(ReducerResetUnitTest);
 
-template <  typename ReducePolicy,
-            typename NumericType,
-            typename WORKING_RES,
-            typename ForOnePol  >
+template <typename ReducePolicy,
+          typename NumericType,
+          typename WORKING_RES,
+          typename ForOnePol>
 void testReducerReset()
 {
-  camp::resources::Resource work_res{WORKING_RES::get_default()};
-  camp::resources::Resource host_res{camp::resources::Host()};
+  camp::resources::Resource work_res {WORKING_RES::get_default()};
+  camp::resources::Resource host_res {camp::resources::Host()};
 
-  NumericType * resetVal = nullptr;
-  NumericType * workVal = nullptr;
+  NumericType* resetVal = nullptr;
+  NumericType* workVal  = nullptr;
 
   NumericType initVal = (NumericType)5;
 
-  workVal = work_res.allocate<NumericType>(1);
+  workVal  = work_res.allocate<NumericType>(1);
   resetVal = host_res.allocate<NumericType>(1);
 
-  work_res.memcpy( workVal, &initVal, sizeof(initVal) );
+  work_res.memcpy(workVal, &initVal, sizeof(initVal));
   resetVal[0] = (NumericType)10;
 
-  #if defined(RAJA_ENABLE_CUDA)
+#if defined(RAJA_ENABLE_CUDA)
   cudaErrchk(cudaDeviceSynchronize());
-  #endif
+#endif
 
-  #if defined(RAJA_ENABLE_HIP)
+#if defined(RAJA_ENABLE_HIP)
   hipErrchk(hipDeviceSynchronize());
-  #endif
+#endif
 
   RAJA::ReduceSum<ReducePolicy, NumericType> reduce_sum(initVal);
   RAJA::ReduceMin<ReducePolicy, NumericType> reduce_min(initVal);
@@ -115,25 +116,18 @@ void testReducerReset()
   RAJA::ReduceMaxLoc<ReducePolicy, NumericType> reduce_maxloc(initVal, 1);
 
   RAJA::tuple<RAJA::Index_type, RAJA::Index_type> LocTup(1, 1);
-  RAJA::ReduceMinLoc<ReducePolicy, NumericType, RAJA::tuple<RAJA::Index_type, RAJA::Index_type>> reduce_minloctup(initVal, LocTup);
-  RAJA::ReduceMaxLoc<ReducePolicy, NumericType, RAJA::tuple<RAJA::Index_type, RAJA::Index_type>> reduce_maxloctup(initVal, LocTup);
+  RAJA::ReduceMinLoc<ReducePolicy, NumericType,
+                     RAJA::tuple<RAJA::Index_type, RAJA::Index_type>>
+      reduce_minloctup(initVal, LocTup);
+  RAJA::ReduceMaxLoc<ReducePolicy, NumericType,
+                     RAJA::tuple<RAJA::Index_type, RAJA::Index_type>>
+      reduce_maxloctup(initVal, LocTup);
 
   // initiate some device computation if using device policy
-  exec_dispatcher < ReducePolicy,
-                    NumericType,
-                    RAJA::Index_type,
-                    RAJA::tuple<RAJA::Index_type, RAJA::Index_type>,
-                    ForOnePol
-                  >
-                 (  reduce_sum,
-                    reduce_min,
-                    reduce_max,
-                    reduce_minloc,
-                    reduce_maxloc,
-                    reduce_minloctup,
-                    reduce_maxloctup,
-                    initVal
-                 );
+  exec_dispatcher<ReducePolicy, NumericType, RAJA::Index_type,
+                  RAJA::tuple<RAJA::Index_type, RAJA::Index_type>, ForOnePol>(
+      reduce_sum, reduce_min, reduce_max, reduce_minloc, reduce_maxloc,
+      reduce_minloctup, reduce_maxloctup, initVal);
 
   // perform real host resets
   reduce_sum.reset(resetVal[0]);
@@ -159,10 +153,14 @@ void testReducerReset()
   ASSERT_EQ((NumericType)reduce_maxloctup.get(), (NumericType)(resetVal[0]));
 
   // Reset of tuple loc defaults to 0
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_minloctup.getLoc())), (RAJA::Index_type)0);
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_minloctup.getLoc())), (RAJA::Index_type)0);
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_maxloctup.getLoc())), (RAJA::Index_type)0);
-  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_maxloctup.getLoc())), (RAJA::Index_type)0);
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_minloctup.getLoc())),
+            (RAJA::Index_type)0);
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_minloctup.getLoc())),
+            (RAJA::Index_type)0);
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<0>(reduce_maxloctup.getLoc())),
+            (RAJA::Index_type)0);
+  ASSERT_EQ((RAJA::Index_type)(RAJA::get<1>(reduce_maxloctup.getLoc())),
+            (RAJA::Index_type)0);
 
   // reset locs to default of -1.
   reduce_minloc.reset(resetVal[0], -1);
@@ -171,20 +169,19 @@ void testReducerReset()
   ASSERT_EQ((RAJA::Index_type)reduce_minloc.getLoc(), (RAJA::Index_type)(-1));
   ASSERT_EQ((RAJA::Index_type)reduce_maxloc.getLoc(), (RAJA::Index_type)(-1));
 
-  work_res.deallocate( workVal );
-  host_res.deallocate( resetVal );
+  work_res.deallocate(workVal);
+  host_res.deallocate(resetVal);
 }
 
 TYPED_TEST_P(ReducerResetUnitTest, BasicReset)
 {
-  using ReduceType = typename camp::at<TypeParam, camp::num<0>>::type;
-  using NumericType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ReduceType   = typename camp::at<TypeParam, camp::num<0>>::type;
+  using NumericType  = typename camp::at<TypeParam, camp::num<1>>::type;
   using ResourceType = typename camp::at<TypeParam, camp::num<2>>::type;
-  using ForOneType = typename camp::at<TypeParam, camp::num<3>>::type;
-  testReducerReset< ReduceType, NumericType, ResourceType, ForOneType >();
+  using ForOneType   = typename camp::at<TypeParam, camp::num<3>>::type;
+  testReducerReset<ReduceType, NumericType, ResourceType, ForOneType>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ReducerResetUnitTest,
-                            BasicReset);
+REGISTER_TYPED_TEST_SUITE_P(ReducerResetUnitTest, BasicReset);
 
 #endif  //__TEST_REDUCER_RESET__
diff --git a/test/unit/resource/tests/test-resource-AsyncTime.hpp b/test/unit/resource/tests/test-resource-AsyncTime.hpp
index 806ba66b26..55fcf488ab 100644
--- a/test/unit/resource/tests/test-resource-AsyncTime.hpp
+++ b/test/unit/resource/tests/test-resource-AsyncTime.hpp
@@ -14,16 +14,16 @@
 #include "RAJA/util/Timer.hpp"
 
 #if defined(RAJA_ENABLE_CUDA)
-inline __host__ __device__ void
-gpu_time_wait_for(float time, float clockrate) {
-  clock_t time_in_clocks = time*clockrate;
+inline __host__ __device__ void gpu_time_wait_for(float time, float clockrate)
+{
+  clock_t time_in_clocks = time * clockrate;
 
-  unsigned int start_clock = (unsigned int) clock();
-  clock_t clock_offset = 0;
+  unsigned int start_clock = (unsigned int)clock();
+  clock_t clock_offset     = 0;
   while (clock_offset < time_in_clocks)
   {
-    unsigned int end_clock = (unsigned int) clock();
-    clock_offset = (clock_t)(end_clock - start_clock);
+    unsigned int end_clock = (unsigned int)clock();
+    clock_offset           = (clock_t)(end_clock - start_clock);
   }
 }
 
@@ -39,61 +39,61 @@ int get_clockrate()
     printf("  CUDA kernel runs will be serialized\n");
     return -1;
   }
-  //printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
-  //    deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+  // printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
+  //     deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
 
 #if defined(__arm__) || defined(__aarch64__)
-  return deviceProp.clockRate/1000;
+  return deviceProp.clockRate / 1000;
 #else
   return deviceProp.clockRate;
 #endif
 }
 
 template <typename WORKING_RES, typename EXEC_POL>
-void ResourceAsyncTimeTestImpl(EXEC_POL&&) {}
+void ResourceAsyncTimeTestImpl(EXEC_POL&&)
+{}
 
 template <typename WORKING_RES, size_t BLOCK_SIZE, bool Async>
 void ResourceAsyncTimeTestImpl(RAJA::cuda_exec<BLOCK_SIZE, Async>&&)
 {
-  constexpr std::size_t ARRAY_SIZE{10000};
+  constexpr std::size_t ARRAY_SIZE {10000};
   using namespace RAJA;
 
-  constexpr std::size_t NUM_STREAMS{8};
+  constexpr std::size_t NUM_STREAMS {8};
   WORKING_RES dev[NUM_STREAMS];
   resources::Host host;
 
-  int clockrate{get_clockrate()};
+  int clockrate {get_clockrate()};
   ASSERT_TRUE(clockrate != -1);
 
   using AsyncExecPol = RAJA::cuda_exec<BLOCK_SIZE, true>;
-  using SyncExecPol = RAJA::cuda_exec<BLOCK_SIZE, false>;
+  using SyncExecPol  = RAJA::cuda_exec<BLOCK_SIZE, false>;
 
   RAJA::Timer sync_timer;
   sync_timer.start();
-  for (std::size_t stream = 0; stream < NUM_STREAMS; ++stream){
-    forall<SyncExecPol>(dev[stream], RangeSegment(0,ARRAY_SIZE),
-      [=] RAJA_HOST_DEVICE (int i) {
-        gpu_time_wait_for(100, clockrate);
-      }
-    );
+  for (std::size_t stream = 0; stream < NUM_STREAMS; ++stream)
+  {
+    forall<SyncExecPol>(dev[stream], RangeSegment(0, ARRAY_SIZE),
+                        [=] RAJA_HOST_DEVICE(int i)
+                        { gpu_time_wait_for(100, clockrate); });
   }
   sync_timer.stop();
   RAJA::Timer::ElapsedType t_sync = sync_timer.elapsed();
 
   RAJA::Timer async_timer;
   async_timer.start();
-  for (std::size_t stream = 0; stream < NUM_STREAMS; ++stream){
-    forall<AsyncExecPol>(dev[stream], RangeSegment(0,ARRAY_SIZE),
-      [=] RAJA_HOST_DEVICE (int i) {
-        gpu_time_wait_for(100, clockrate);
-      }
-    );
+  for (std::size_t stream = 0; stream < NUM_STREAMS; ++stream)
+  {
+    forall<AsyncExecPol>(dev[stream], RangeSegment(0, ARRAY_SIZE),
+                         [=] RAJA_HOST_DEVICE(int i)
+                         { gpu_time_wait_for(100, clockrate); });
   }
   async_timer.stop();
   RAJA::Timer::ElapsedType t_async = async_timer.elapsed();
 
-  // We expect "total async time" to be roughly equal to "total sync time" / NUM_STREAMS.
-  // For comparison tolerance, we multiple the latter by 2 in the check.
+  // We expect "total async time" to be roughly equal to "total sync time" /
+  // NUM_STREAMS. For comparison tolerance, we multiply the latter by 2 in the
+  // check.
   ASSERT_LT(t_async, 2 * (t_sync / NUM_STREAMS));
 }
 
@@ -106,15 +106,15 @@ void ResourceAsyncTimeTestCall()
 #else
 
 template <typename WORKING_RES, typename EXEC_POLICY>
-void ResourceAsyncTimeTestCall() {}
+void ResourceAsyncTimeTestCall()
+{}
 
 #endif
 
 TYPED_TEST_SUITE_P(ResourceAsyncTimeTest);
 template <typename T>
 class ResourceAsyncTimeTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ResourceAsyncTimeTest, ResourceAsyncTime)
 {
@@ -124,7 +124,6 @@ TYPED_TEST_P(ResourceAsyncTimeTest, ResourceAsyncTime)
   ResourceAsyncTimeTestCall<WORKING_RES, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ResourceAsyncTimeTest,
-                            ResourceAsyncTime);
+REGISTER_TYPED_TEST_SUITE_P(ResourceAsyncTimeTest, ResourceAsyncTime);
 
 #endif  // __TEST_RESOURCE_ASYNC_HPP__
diff --git a/test/unit/resource/tests/test-resource-BasicAsyncSemantics.hpp b/test/unit/resource/tests/test-resource-BasicAsyncSemantics.hpp
index b1939240e4..8de49f37d7 100644
--- a/test/unit/resource/tests/test-resource-BasicAsyncSemantics.hpp
+++ b/test/unit/resource/tests/test-resource-BasicAsyncSemantics.hpp
@@ -13,49 +13,40 @@
 template <typename WORKING_RES, typename EXEC_POLICY>
 void ResourceBasicAsyncSemanticsTestImpl()
 {
-  constexpr std::size_t ARRAY_SIZE{10000000};
+  constexpr std::size_t ARRAY_SIZE {10000000};
   using namespace RAJA;
 
   WORKING_RES dev;
   resources::Host host;
 
-  int* d_array = resources::Resource{dev}.allocate<int>(ARRAY_SIZE);
-  int* h_array  = host.allocate<int>(ARRAY_SIZE);
+  int* d_array = resources::Resource {dev}.allocate<int>(ARRAY_SIZE);
+  int* h_array = host.allocate<int>(ARRAY_SIZE);
 
-  forall<policy::sequential::seq_exec>(host, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      h_array[i] = i;
-    }
-  );
+  forall<policy::sequential::seq_exec>(host, RangeSegment(0, ARRAY_SIZE),
+                                       [=] RAJA_HOST_DEVICE(int i)
+                                       { h_array[i] = i; });
 
   dev.memcpy(d_array, h_array, sizeof(int) * ARRAY_SIZE);
 
-  forall<EXEC_POLICY>(dev, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      d_array[i] = i + 2;
-    }
-  );
+  forall<EXEC_POLICY>(dev, RangeSegment(0, ARRAY_SIZE),
+                      [=] RAJA_HOST_DEVICE(int i) { d_array[i] = i + 2; });
 
   dev.memcpy(h_array, d_array, sizeof(int) * ARRAY_SIZE);
 
   dev.wait();
 
-  forall<policy::sequential::seq_exec>(host, RangeSegment(0,ARRAY_SIZE),
-    [=] (int i) {
-      ASSERT_EQ(h_array[i], i + 2); 
-    }
-  );
+  forall<policy::sequential::seq_exec>(host, RangeSegment(0, ARRAY_SIZE),
+                                       [=](int i)
+                                       { ASSERT_EQ(h_array[i], i + 2); });
 
   dev.deallocate(d_array);
   host.deallocate(h_array);
-  
 }
 
 TYPED_TEST_SUITE_P(ResourceBasicAsyncSemanticsTest);
 template <typename T>
 class ResourceBasicAsyncSemanticsTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ResourceBasicAsyncSemanticsTest, ResourceBasicAsyncSemantics)
 {
diff --git a/test/unit/resource/tests/test-resource-Depends.hpp b/test/unit/resource/tests/test-resource-Depends.hpp
index 0c1b748de2..c7b050dd18 100644
--- a/test/unit/resource/tests/test-resource-Depends.hpp
+++ b/test/unit/resource/tests/test-resource-Depends.hpp
@@ -13,59 +13,48 @@
 template <typename WORKING_RES, typename EXEC_POLICY>
 void ResourceDependsTestImpl()
 {
-  constexpr std::size_t ARRAY_SIZE{10000};
+  constexpr std::size_t ARRAY_SIZE {10000};
   using namespace RAJA;
 
   WORKING_RES dev1;
   WORKING_RES dev2;
   resources::Host host;
 
-  int* d_array1 = resources::Resource{dev1}.allocate<int>(ARRAY_SIZE);
-  int* d_array2 = resources::Resource{dev2}.allocate<int>(ARRAY_SIZE);
+  int* d_array1 = resources::Resource {dev1}.allocate<int>(ARRAY_SIZE);
+  int* d_array2 = resources::Resource {dev2}.allocate<int>(ARRAY_SIZE);
   int* h_array  = host.allocate<int>(ARRAY_SIZE);
 
 
-  forall<EXEC_POLICY>(dev1, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      d_array1[i] = i;
-    }
-  );
+  forall<EXEC_POLICY>(dev1, RangeSegment(0, ARRAY_SIZE),
+                      [=] RAJA_HOST_DEVICE(int i) { d_array1[i] = i; });
 
-  resources::Event e = forall<EXEC_POLICY>(dev2, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      d_array2[i] = -1;
-    }
-  );
+  resources::Event e =
+      forall<EXEC_POLICY>(dev2, RangeSegment(0, ARRAY_SIZE),
+                          [=] RAJA_HOST_DEVICE(int i) { d_array2[i] = -1; });
 
   dev1.wait_for(&e);
 
-  forall<EXEC_POLICY>(dev1, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      d_array1[i] *= d_array2[i];
-    }
-  );
+  forall<EXEC_POLICY>(dev1, RangeSegment(0, ARRAY_SIZE),
+                      [=] RAJA_HOST_DEVICE(int i)
+                      { d_array1[i] *= d_array2[i]; });
 
   dev1.memcpy(h_array, d_array1, sizeof(int) * ARRAY_SIZE);
 
   dev1.wait();
 
-  forall<policy::sequential::seq_exec>(host, RangeSegment(0,ARRAY_SIZE),
-    [=] (int i) {
-      ASSERT_EQ(h_array[i], -i); 
-    }
-  );
+  forall<policy::sequential::seq_exec>(host, RangeSegment(0, ARRAY_SIZE),
+                                       [=](int i)
+                                       { ASSERT_EQ(h_array[i], -i); });
 
   dev1.deallocate(d_array1);
   dev2.deallocate(d_array2);
   host.deallocate(h_array);
-  
 }
 
 TYPED_TEST_SUITE_P(ResourceDependsTest);
 template <typename T>
 class ResourceDependsTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ResourceDependsTest, ResourceDepends)
 {
@@ -75,7 +64,6 @@ TYPED_TEST_P(ResourceDependsTest, ResourceDepends)
   ResourceDependsTestImpl<WORKING_RES, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ResourceDependsTest,
-                            ResourceDepends);
+REGISTER_TYPED_TEST_SUITE_P(ResourceDependsTest, ResourceDepends);
 
 #endif  // __TEST_RESOURCE_DEPENDS_HPP__
diff --git a/test/unit/resource/tests/test-resource-JoinAsyncSemantics.hpp b/test/unit/resource/tests/test-resource-JoinAsyncSemantics.hpp
index a8e30d9719..51dc837935 100644
--- a/test/unit/resource/tests/test-resource-JoinAsyncSemantics.hpp
+++ b/test/unit/resource/tests/test-resource-JoinAsyncSemantics.hpp
@@ -13,32 +13,28 @@
 template <typename WORKING_RES, typename EXEC_POLICY>
 void ResourceJoinAsyncSemanticsTestImpl()
 {
-  constexpr std::size_t ARRAY_SIZE{1000000};
+  constexpr std::size_t ARRAY_SIZE {1000000};
   using namespace RAJA;
 
   WORKING_RES dev1;
   WORKING_RES dev2;
   resources::Host host;
 
-  int* d_array = resources::Resource{dev1}.allocate<int>(ARRAY_SIZE);
-  int* h_array  = host.allocate<int>(ARRAY_SIZE);
+  int* d_array = resources::Resource {dev1}.allocate<int>(ARRAY_SIZE);
+  int* h_array = host.allocate<int>(ARRAY_SIZE);
 
-  forall<policy::sequential::seq_exec>(host, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      h_array[i] = i;
-    }
-  );
+  forall<policy::sequential::seq_exec>(host, RangeSegment(0, ARRAY_SIZE),
+                                       [=] RAJA_HOST_DEVICE(int i)
+                                       { h_array[i] = i; });
 
   dev2.memcpy(d_array, h_array, sizeof(int) * ARRAY_SIZE);
 
   auto e1 = dev2.get_event_erased();
   dev1.wait_for(&e1);
 
-  RAJA::resources::Event e2 = forall<EXEC_POLICY>(dev1, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      d_array[i] = i + 2;
-    }
-  );
+  RAJA::resources::Event e2 =
+      forall<EXEC_POLICY>(dev1, RangeSegment(0, ARRAY_SIZE),
+                          [=] RAJA_HOST_DEVICE(int i) { d_array[i] = i + 2; });
 
   dev2.wait_for(&e2);
 
@@ -46,22 +42,18 @@ void ResourceJoinAsyncSemanticsTestImpl()
 
   dev2.wait();
 
-  forall<policy::sequential::seq_exec>(host, RangeSegment(0,ARRAY_SIZE),
-    [=] (int i) {
-      ASSERT_EQ(h_array[i], i + 2); 
-    }
-  );
+  forall<policy::sequential::seq_exec>(host, RangeSegment(0, ARRAY_SIZE),
+                                       [=](int i)
+                                       { ASSERT_EQ(h_array[i], i + 2); });
 
   dev1.deallocate(d_array);
   host.deallocate(h_array);
-  
 }
 
 TYPED_TEST_SUITE_P(ResourceJoinAsyncSemanticsTest);
 template <typename T>
 class ResourceJoinAsyncSemanticsTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ResourceJoinAsyncSemanticsTest, ResourceJoinAsyncSemantics)
 {
diff --git a/test/unit/resource/tests/test-resource-MultiStream.hpp b/test/unit/resource/tests/test-resource-MultiStream.hpp
index 7f545229f1..f52fa0a817 100644
--- a/test/unit/resource/tests/test-resource-MultiStream.hpp
+++ b/test/unit/resource/tests/test-resource-MultiStream.hpp
@@ -13,7 +13,7 @@
 template <typename WORKING_RES, typename EXEC_POLICY>
 void ResourceMultiStreamTestImpl()
 {
-  constexpr std::size_t ARRAY_SIZE{10000};
+  constexpr std::size_t ARRAY_SIZE {10000};
   using namespace RAJA;
 
   WORKING_RES dev1;
@@ -21,29 +21,35 @@ void ResourceMultiStreamTestImpl()
   WORKING_RES dev3;
   resources::Host host;
 
-  int* d_array = resources::Resource{dev1}.allocate<int>(ARRAY_SIZE);
-  int* h_array  = host.allocate<int>(ARRAY_SIZE);
-
-  resources::Event e1 = forall<EXEC_POLICY>(dev1, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      if (i % 3 == 0) {
-        d_array[i] = i;
-      }
-  });
-
-  resources::Event e2 = forall<EXEC_POLICY>(dev2, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      if (i % 3 == 1) {
-        d_array[i] = i;
-      }
-  });
-
-  resources::Event e3 = forall<EXEC_POLICY>(dev2, RangeSegment(0,ARRAY_SIZE),
-    [=] RAJA_HOST_DEVICE (int i) {
-      if (i % 3 == 2) {
-        d_array[i] = i;
-      }
-  });
+  int* d_array = resources::Resource {dev1}.allocate<int>(ARRAY_SIZE);
+  int* h_array = host.allocate<int>(ARRAY_SIZE);
+
+  resources::Event e1 = forall<EXEC_POLICY>(dev1, RangeSegment(0, ARRAY_SIZE),
+                                            [=] RAJA_HOST_DEVICE(int i)
+                                            {
+                                              if (i % 3 == 0)
+                                              {
+                                                d_array[i] = i;
+                                              }
+                                            });
+
+  resources::Event e2 = forall<EXEC_POLICY>(dev2, RangeSegment(0, ARRAY_SIZE),
+                                            [=] RAJA_HOST_DEVICE(int i)
+                                            {
+                                              if (i % 3 == 1)
+                                              {
+                                                d_array[i] = i;
+                                              }
+                                            });
+
+  resources::Event e3 = forall<EXEC_POLICY>(dev2, RangeSegment(0, ARRAY_SIZE),
+                                            [=] RAJA_HOST_DEVICE(int i)
+                                            {
+                                              if (i % 3 == 2)
+                                              {
+                                                d_array[i] = i;
+                                              }
+                                            });
 
   dev1.wait_for(&e2);
   dev1.wait_for(&e3);
@@ -52,11 +58,9 @@ void ResourceMultiStreamTestImpl()
 
   dev1.wait();
 
-  forall<policy::sequential::seq_exec>(host, RangeSegment(0,ARRAY_SIZE),
-    [=] (int i) {
-      ASSERT_EQ(h_array[i], i); 
-    }
-  );
+  forall<policy::sequential::seq_exec>(host, RangeSegment(0, ARRAY_SIZE),
+                                       [=](int i)
+                                       { ASSERT_EQ(h_array[i], i); });
 
   dev1.deallocate(d_array);
   host.deallocate(h_array);
@@ -65,8 +69,7 @@ void ResourceMultiStreamTestImpl()
 TYPED_TEST_SUITE_P(ResourceMultiStreamTest);
 template <typename T>
 class ResourceMultiStreamTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_P(ResourceMultiStreamTest, ResourceMultiStream)
 {
@@ -76,7 +79,6 @@ TYPED_TEST_P(ResourceMultiStreamTest, ResourceMultiStream)
   ResourceMultiStreamTestImpl<WORKING_RES, EXEC_POLICY>();
 }
 
-REGISTER_TYPED_TEST_SUITE_P(ResourceMultiStreamTest,
-                            ResourceMultiStream);
+REGISTER_TYPED_TEST_SUITE_P(ResourceMultiStreamTest, ResourceMultiStream);
 
 #endif  // __TEST_RESOURCE_MULTISTREAM_HPP__
diff --git a/test/unit/util/operator/test-operators-bitwise-modulus.cpp b/test/unit/util/operator/test-operators-bitwise-modulus.cpp
index c2906cbe5f..db81160eb1 100644
--- a/test/unit/util/operator/test-operators-bitwise-modulus.cpp
+++ b/test/unit/util/operator/test-operators-bitwise-modulus.cpp
@@ -12,12 +12,13 @@
 #include "RAJA_test-base.hpp"
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class OperatorsIntegralUnitTest : public ::testing::Test {};
+template <typename T>
+class OperatorsIntegralUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(OperatorsIntegralUnitTest, UnitExpandedIntegralTypes);
 
-template<typename T>
+template <typename T>
 void modulus_test()
 {
   using Mod = RAJA::operators::modulus<T>;
@@ -25,16 +26,17 @@ void modulus_test()
   Mod m;
   T i = static_cast<T>(5);
   T j = static_cast<T>(2);
-  ASSERT_EQ(m(i,j), T(1));
+  ASSERT_EQ(m(i, j), T(1));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-2);
-    ASSERT_EQ(m(i,j), T(-1));
+    ASSERT_EQ(m(i, j), T(-1));
   }
 }
 
-template<typename T>
+template <typename T>
 void bit_or_test()
 {
   using Or = RAJA::operators::bit_or<T>;
@@ -43,12 +45,12 @@ void bit_or_test()
   T i = static_cast<T>(0010);
   T j = static_cast<T>(0001);
   T k = static_cast<T>(0111);
-  ASSERT_EQ(o(i,j), T(0011));
-  ASSERT_EQ(o(i,k), T(0111));
-  ASSERT_EQ(o(j,k), T(0111));
+  ASSERT_EQ(o(i, j), T(0011));
+  ASSERT_EQ(o(i, k), T(0111));
+  ASSERT_EQ(o(j, k), T(0111));
 }
 
-template<typename T>
+template <typename T>
 void bit_and_test()
 {
   using And = RAJA::operators::bit_and<T>;
@@ -57,12 +59,12 @@ void bit_and_test()
   T i = static_cast<T>(0010);
   T j = static_cast<T>(0001);
   T k = static_cast<T>(0111);
-  ASSERT_EQ(a(i,j), T(0000));
-  ASSERT_EQ(a(i,k), T(0010));
-  ASSERT_EQ(a(j,k), T(0001));
+  ASSERT_EQ(a(i, j), T(0000));
+  ASSERT_EQ(a(i, k), T(0010));
+  ASSERT_EQ(a(j, k), T(0001));
 }
 
-template<typename T>
+template <typename T>
 void bit_xor_test()
 {
   using Xor = RAJA::operators::bit_xor<T>;
@@ -71,12 +73,13 @@ void bit_xor_test()
   T i = static_cast<T>(0010);
   T j = static_cast<T>(0001);
   T k = static_cast<T>(0111);
-  ASSERT_EQ(x(i,j), T(0011));
-  ASSERT_EQ(x(i,k), T(0101));
-  ASSERT_EQ(x(j,k), T(0110));
+  ASSERT_EQ(x(i, j), T(0011));
+  ASSERT_EQ(x(i, k), T(0101));
+  ASSERT_EQ(x(j, k), T(0110));
 }
 
-TYPED_TEST(OperatorsIntegralUnitTest, bitwise_modulus) {
+TYPED_TEST(OperatorsIntegralUnitTest, bitwise_modulus)
+{
   bit_or_test<TypeParam>();
   bit_and_test<TypeParam>();
   bit_xor_test<TypeParam>();
diff --git a/test/unit/util/operator/test-operators-equivalence.cpp b/test/unit/util/operator/test-operators-equivalence.cpp
index f2a0a84c54..710dc21abd 100644
--- a/test/unit/util/operator/test-operators-equivalence.cpp
+++ b/test/unit/util/operator/test-operators-equivalence.cpp
@@ -12,12 +12,13 @@
 #include "RAJA_test-base.hpp"
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class OperatorsUnitTestEquivalence : public ::testing::Test {};
+template <typename T>
+class OperatorsUnitTestEquivalence : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(OperatorsUnitTestEquivalence, UnitIntFloatTypes);
 
-template<typename T>
+template <typename T>
 void equal_test()
 {
   using Eq = RAJA::operators::equal_to<T>;
@@ -25,16 +26,17 @@ void equal_test()
   Eq eq;
   T i = static_cast<T>(5);
   T j = static_cast<T>(5);
-  ASSERT_TRUE(eq(i,j));
+  ASSERT_TRUE(eq(i, j));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-5);
-    ASSERT_TRUE(eq(i,j));
+    ASSERT_TRUE(eq(i, j));
   }
 }
 
-template<typename T>
+template <typename T>
 void not_equal_test()
 {
   using NEq = RAJA::operators::not_equal_to<T>;
@@ -42,16 +44,17 @@ void not_equal_test()
   NEq neq;
   T i = static_cast<T>(5);
   T j = static_cast<T>(3);
-  ASSERT_TRUE(neq(i,j));
+  ASSERT_TRUE(neq(i, j));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-3);
-    ASSERT_TRUE(neq(i,j));
+    ASSERT_TRUE(neq(i, j));
   }
 }
 
-template<typename T>
+template <typename T>
 void greater_test()
 {
   using G = RAJA::operators::greater<T>;
@@ -59,18 +62,19 @@ void greater_test()
   G g;
   T i = static_cast<T>(5);
   T j = static_cast<T>(4);
-  ASSERT_TRUE(g(i,j));
-  ASSERT_FALSE(g(j,i));
+  ASSERT_TRUE(g(i, j));
+  ASSERT_FALSE(g(j, i));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-4);
     j = static_cast<T>(-5);
-    ASSERT_TRUE(g(i,j));
-    ASSERT_FALSE(g(j,i));
+    ASSERT_TRUE(g(i, j));
+    ASSERT_FALSE(g(j, i));
   }
 }
 
-template<typename T>
+template <typename T>
 void less_test()
 {
   using L = RAJA::operators::less<T>;
@@ -78,64 +82,67 @@ void less_test()
   L l;
   T i = static_cast<T>(4);
   T j = static_cast<T>(5);
-  ASSERT_TRUE(l(i,j));
-  ASSERT_FALSE(l(j,i));
+  ASSERT_TRUE(l(i, j));
+  ASSERT_FALSE(l(j, i));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-4);
-    ASSERT_TRUE(l(i,j));
-    ASSERT_FALSE(l(j,i));
+    ASSERT_TRUE(l(i, j));
+    ASSERT_FALSE(l(j, i));
   }
 }
 
-template<typename T>
+template <typename T>
 void greater_eq_test()
 {
   using G = RAJA::operators::greater_equal<T>;
 
   G g;
-  T i = static_cast<T>(5);
+  T i  = static_cast<T>(5);
   T i2 = static_cast<T>(5);
-  T j = static_cast<T>(4);
-  ASSERT_TRUE(g(i,j));
-  ASSERT_TRUE(g(i,i2));
-  ASSERT_FALSE(g(j,i));
-
-  if (std::is_signed<T>::value) {
-    i = static_cast<T>(-4);
+  T j  = static_cast<T>(4);
+  ASSERT_TRUE(g(i, j));
+  ASSERT_TRUE(g(i, i2));
+  ASSERT_FALSE(g(j, i));
+
+  if (std::is_signed<T>::value)
+  {
+    i  = static_cast<T>(-4);
     i2 = static_cast<T>(-4);
-    j = static_cast<T>(-5);
-    ASSERT_TRUE(g(i,j));
-    ASSERT_TRUE(g(i,i2));
-    ASSERT_FALSE(g(j,i));
+    j  = static_cast<T>(-5);
+    ASSERT_TRUE(g(i, j));
+    ASSERT_TRUE(g(i, i2));
+    ASSERT_FALSE(g(j, i));
   }
 }
 
-template<typename T>
+template <typename T>
 void less_eq_test()
 {
   using L = RAJA::operators::less_equal<T>;
 
   L l;
-  T i = static_cast<T>(4);
+  T i  = static_cast<T>(4);
   T i2 = static_cast<T>(4);
-  T j = static_cast<T>(5);
-  ASSERT_TRUE(l(i,j));
-  ASSERT_TRUE(l(i,i2));
-  ASSERT_FALSE(l(j,i));
-
-  if (std::is_signed<T>::value) {
-    i = static_cast<T>(-5);
+  T j  = static_cast<T>(5);
+  ASSERT_TRUE(l(i, j));
+  ASSERT_TRUE(l(i, i2));
+  ASSERT_FALSE(l(j, i));
+
+  if (std::is_signed<T>::value)
+  {
+    i  = static_cast<T>(-5);
     i2 = static_cast<T>(-5);
-    j = static_cast<T>(-4);
-    ASSERT_TRUE(l(i,j));
-    ASSERT_TRUE(l(i,i2));
-    ASSERT_FALSE(l(j,i));
+    j  = static_cast<T>(-4);
+    ASSERT_TRUE(l(i, j));
+    ASSERT_TRUE(l(i, i2));
+    ASSERT_FALSE(l(j, i));
   }
 }
 
-template<typename T>
+template <typename T>
 void maximum_test()
 {
   using Max = RAJA::operators::maximum<T>;
@@ -143,16 +150,17 @@ void maximum_test()
   Max m;
   T i = static_cast<T>(5);
   T j = static_cast<T>(2);
-  ASSERT_EQ(m(i,j), i);
+  ASSERT_EQ(m(i, j), i);
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-2);
-    ASSERT_EQ(m(i,j), j);
+    ASSERT_EQ(m(i, j), j);
   }
 }
 
-template<typename T>
+template <typename T>
 void minimum_test()
 {
   using Min = RAJA::operators::minimum<T>;
@@ -160,16 +168,18 @@ void minimum_test()
   Min m;
   T i = static_cast<T>(5);
   T j = static_cast<T>(2);
-  ASSERT_EQ(m(i,j), j);
+  ASSERT_EQ(m(i, j), j);
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-2);
-    ASSERT_EQ(m(i,j), i);
+    ASSERT_EQ(m(i, j), i);
   }
 }
 
-TYPED_TEST(OperatorsUnitTestEquivalence, equivalence) {
+TYPED_TEST(OperatorsUnitTestEquivalence, equivalence)
+{
   minimum_test<TypeParam>();
   maximum_test<TypeParam>();
   equal_test<TypeParam>();
diff --git a/test/unit/util/operator/test-operators-identity.cpp b/test/unit/util/operator/test-operators-identity.cpp
index 4b320d1c04..ef7589d05d 100644
--- a/test/unit/util/operator/test-operators-identity.cpp
+++ b/test/unit/util/operator/test-operators-identity.cpp
@@ -12,12 +12,13 @@
 #include "RAJA_test-base.hpp"
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class OperatorsUnitTestIdentity: public ::testing::Test {};
+template <typename T>
+class OperatorsUnitTestIdentity : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(OperatorsUnitTestIdentity, UnitIntFloatTypes);
 
-template<typename T>
+template <typename T>
 void identity_test()
 {
   using Ident = RAJA::operators::identity<T>;
@@ -28,13 +29,14 @@ void identity_test()
   ASSERT_EQ(id(i), T(0));
   ASSERT_EQ(id(j), T(1));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     j = static_cast<T>(-1);
     ASSERT_EQ(id(j), T(-1));
   }
 }
 
-template<typename T>
+template <typename T>
 void project1st_test()
 {
   using Proj1 = RAJA::operators::project1st<T, T>;
@@ -42,17 +44,18 @@ void project1st_test()
   Proj1 p;
   T i = static_cast<T>(0);
   T j = static_cast<T>(1);
-  ASSERT_EQ(p(i,j), T(0));
-  ASSERT_EQ(p(j,i), T(1));
+  ASSERT_EQ(p(i, j), T(0));
+  ASSERT_EQ(p(j, i), T(1));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     j = static_cast<T>(-1);
-    ASSERT_EQ(p(i,j), T(0));
-    ASSERT_EQ(p(j,i), T(-1));
+    ASSERT_EQ(p(i, j), T(0));
+    ASSERT_EQ(p(j, i), T(-1));
   }
 }
 
-template<typename T>
+template <typename T>
 void project2nd_test()
 {
   using Proj2 = RAJA::operators::project2nd<T, T>;
@@ -60,23 +63,26 @@ void project2nd_test()
   Proj2 p;
   T i = static_cast<T>(0);
   T j = static_cast<T>(1);
-  ASSERT_EQ(p(i,j), T(1));
-  ASSERT_EQ(p(j,i), T(0));
+  ASSERT_EQ(p(i, j), T(1));
+  ASSERT_EQ(p(j, i), T(0));
 
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4245 )  // Force msvc to not emit signed conversion warning
+#pragma warning(                                                               \
+    disable : 4245)  // Force msvc to not emit signed conversion warning
 #endif
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     j = static_cast<T>(-1);
-    ASSERT_EQ(p(i,j), T(-1));
-    ASSERT_EQ(p(j,i), T(0));
+    ASSERT_EQ(p(i, j), T(-1));
+    ASSERT_EQ(p(j, i), T(0));
   }
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4245 )
+#pragma warning(default : 4245)
 #endif
 }
 
-TYPED_TEST(OperatorsUnitTestIdentity, identity_project) {
+TYPED_TEST(OperatorsUnitTestIdentity, identity_project)
+{
   identity_test<TypeParam>();
   project1st_test<TypeParam>();
   project2nd_test<TypeParam>();
diff --git a/test/unit/util/operator/test-operators-logical.cpp b/test/unit/util/operator/test-operators-logical.cpp
index 3fde5644a4..8edb9cdad0 100644
--- a/test/unit/util/operator/test-operators-logical.cpp
+++ b/test/unit/util/operator/test-operators-logical.cpp
@@ -12,11 +12,12 @@
 #include "RAJA_test-base.hpp"
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class OperatorsUnitTestLogical : public ::testing::Test {};
+template <typename T>
+class OperatorsUnitTestLogical : public ::testing::Test
+{};
 TYPED_TEST_SUITE(OperatorsUnitTestLogical, UnitIntFloatTypes);
 
-template<typename T>
+template <typename T>
 void logical_and_test()
 {
   using And = RAJA::operators::logical_and<T>;
@@ -28,21 +29,22 @@ void logical_and_test()
   T j0 = static_cast<T>(0);
   T j1 = static_cast<T>(1);
   T j2 = static_cast<T>(2);
-  ASSERT_FALSE(a(i0,j0));
-  ASSERT_FALSE(a(i0,j1));
-  ASSERT_FALSE(a(i1,j0));
-  ASSERT_TRUE(a(i1,j1));
-  ASSERT_TRUE(a(i2,j2));
-  if (std::is_signed<T>::value) {
+  ASSERT_FALSE(a(i0, j0));
+  ASSERT_FALSE(a(i0, j1));
+  ASSERT_FALSE(a(i1, j0));
+  ASSERT_TRUE(a(i1, j1));
+  ASSERT_TRUE(a(i2, j2));
+  if (std::is_signed<T>::value)
+  {
     i1 = static_cast<T>(-1);
     j1 = static_cast<T>(-1);
-    ASSERT_FALSE(a(i0,j1));
-    ASSERT_FALSE(a(i1,j0));
-    ASSERT_TRUE(a(i1,j1));
+    ASSERT_FALSE(a(i0, j1));
+    ASSERT_FALSE(a(i1, j0));
+    ASSERT_TRUE(a(i1, j1));
   }
 }
 
-template<typename T>
+template <typename T>
 void logical_or_test()
 {
   using Or = RAJA::operators::logical_or<T>;
@@ -54,21 +56,22 @@ void logical_or_test()
   T j0 = static_cast<T>(0);
   T j1 = static_cast<T>(1);
   T j2 = static_cast<T>(2);
-  ASSERT_FALSE(o(i0,j0));
-  ASSERT_TRUE(o(i0,j1));
-  ASSERT_TRUE(o(i1,j0));
-  ASSERT_TRUE(o(i1,j1));
-  ASSERT_TRUE(o(i2,j2));
-  if (std::is_signed<T>::value) {
+  ASSERT_FALSE(o(i0, j0));
+  ASSERT_TRUE(o(i0, j1));
+  ASSERT_TRUE(o(i1, j0));
+  ASSERT_TRUE(o(i1, j1));
+  ASSERT_TRUE(o(i2, j2));
+  if (std::is_signed<T>::value)
+  {
     i1 = static_cast<T>(-1);
     j1 = static_cast<T>(-1);
-    ASSERT_TRUE(o(i0,j1));
-    ASSERT_TRUE(o(i1,j0));
-    ASSERT_TRUE(o(i1,j1));
+    ASSERT_TRUE(o(i0, j1));
+    ASSERT_TRUE(o(i1, j0));
+    ASSERT_TRUE(o(i1, j1));
   }
 }
 
-template<typename T>
+template <typename T>
 void logical_not_test()
 {
   using Not = RAJA::operators::logical_not<T>;
@@ -78,13 +81,15 @@ void logical_not_test()
   T i1 = static_cast<T>(1);
   ASSERT_FALSE(n(i1));
   ASSERT_TRUE(n(i0));
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i1 = static_cast<T>(-1);
     ASSERT_FALSE(n(i1));
   }
 }
 
-TYPED_TEST(OperatorsUnitTestLogical, logical) {
+TYPED_TEST(OperatorsUnitTestLogical, logical)
+{
   logical_and_test<TypeParam>();
   logical_or_test<TypeParam>();
   logical_not_test<TypeParam>();
diff --git a/test/unit/util/operator/test-operators-math.cpp b/test/unit/util/operator/test-operators-math.cpp
index 054efd41c8..16dd7c170a 100644
--- a/test/unit/util/operator/test-operators-math.cpp
+++ b/test/unit/util/operator/test-operators-math.cpp
@@ -12,11 +12,12 @@
 #include "RAJA_test-base.hpp"
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class OperatorsUnitTestMath : public ::testing::Test {};
+template <typename T>
+class OperatorsUnitTestMath : public ::testing::Test
+{};
 TYPED_TEST_SUITE(OperatorsUnitTestMath, UnitIntFloatTypes);
 
-template<typename T>
+template <typename T>
 void plus_test()
 {
   using Plus = RAJA::operators::plus<T>;
@@ -26,16 +27,17 @@ void plus_test()
   Plus p;
   T i = static_cast<T>(1);
   T j = static_cast<T>(2);
-  ASSERT_EQ(p(i,j), T(3));
+  ASSERT_EQ(p(i, j), T(3));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-2);
-    ASSERT_EQ(p(i,j), T(-7));
+    ASSERT_EQ(p(i, j), T(-7));
   }
 }
 
-template<typename T>
+template <typename T>
 void minus_test()
 {
   using Minus = RAJA::operators::minus<T>;
@@ -43,16 +45,17 @@ void minus_test()
   Minus m;
   T i = static_cast<T>(5);
   T j = static_cast<T>(2);
-  ASSERT_EQ(m(i,j), T(3));
+  ASSERT_EQ(m(i, j), T(3));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-2);
-    ASSERT_EQ(m(i,j), T(-3));
+    ASSERT_EQ(m(i, j), T(-3));
   }
 }
 
-template<typename T>
+template <typename T>
 void multiplies_test()
 {
   using Mult = RAJA::operators::multiplies<T>;
@@ -62,16 +65,17 @@ void multiplies_test()
   Mult m;
   T i = static_cast<T>(5);
   T j = static_cast<T>(2);
-  ASSERT_EQ(m(i,j), T(10));
+  ASSERT_EQ(m(i, j), T(10));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-2);
-    ASSERT_EQ(m(i,j), T(10));
+    ASSERT_EQ(m(i, j), T(10));
   }
 }
 
-template<typename T>
+template <typename T>
 void divides_test()
 {
   using Div = RAJA::operators::divides<T>;
@@ -79,22 +83,24 @@ void divides_test()
   Div d;
   T i = static_cast<T>(5);
   T j = static_cast<T>(2);
-  if(std::is_floating_point<T>::value) 
-    ASSERT_EQ(d(i,j), T(2.5));
+  if (std::is_floating_point<T>::value)
+    ASSERT_EQ(d(i, j), T(2.5));
   else
-    ASSERT_EQ(d(i,j), T(2));
+    ASSERT_EQ(d(i, j), T(2));
 
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     i = static_cast<T>(-5);
     j = static_cast<T>(-2);
-    if(std::is_floating_point<T>::value) 
-      ASSERT_EQ(d(i,j), T(2.5));
+    if (std::is_floating_point<T>::value)
+      ASSERT_EQ(d(i, j), T(2.5));
     else
-      ASSERT_EQ(d(i,j), T(2));
+      ASSERT_EQ(d(i, j), T(2));
   }
 }
 
-TYPED_TEST(OperatorsUnitTestMath, math) {
+TYPED_TEST(OperatorsUnitTestMath, math)
+{
   plus_test<TypeParam>();
   minus_test<TypeParam>();
   multiplies_test<TypeParam>();
diff --git a/test/unit/util/test-float-limits.cpp b/test/unit/util/test-float-limits.cpp
index 80635a74e1..d54e454083 100644
--- a/test/unit/util/test-float-limits.cpp
+++ b/test/unit/util/test-float-limits.cpp
@@ -6,12 +6,12 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 ///
-/// Source file containing tests for floating point numeric limits in 
+/// Source file containing tests for floating point numeric limits in
 /// RAJA operators
 ///
 
 #include "RAJA_test-base.hpp"
-#include "RAJA_unit-test-types.hpp" 
+#include "RAJA_unit-test-types.hpp"
 
 #define RAJA_CHECK_LIMITS
 #include "RAJA/util/Operators.hpp"
@@ -20,8 +20,7 @@
 
 template <typename T>
 class FloatLimitsUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(FloatLimitsUnitTest);
 
diff --git a/test/unit/util/test-fraction.cpp b/test/unit/util/test-fraction.cpp
index 5161b2bb3a..6cc8941184 100644
--- a/test/unit/util/test-fraction.cpp
+++ b/test/unit/util/test-fraction.cpp
@@ -30,16 +30,17 @@ void testFractionMultiplyTypesValues()
             IntegerType(double(numerator) / double(denominator) * double(101)));
 
   // Test where naive algorithm causes overflow, when within precision of double
-  if /*constexpr*/ (sizeof(IntegerType) < sizeof(double)) {
+  if /*constexpr*/ (sizeof(IntegerType) < sizeof(double))
+  {
 
     static constexpr IntegerType max = std::numeric_limits<IntegerType>::max();
-    static constexpr IntegerType val = (numerator > denominator) ?
-        (max / numerator * denominator) : max;
+    static constexpr IntegerType val =
+        (numerator > denominator) ? (max / numerator * denominator) : max;
 
-    ASSERT_EQ(Frac::multiply(IntegerType(val)),
-              IntegerType(double(numerator) / double(denominator) * double(val)));
+    ASSERT_EQ(
+        Frac::multiply(IntegerType(val)),
+        IntegerType(double(numerator) / double(denominator) * double(val)));
   }
-
 }
 
 template <typename IntegerType>
@@ -54,8 +55,8 @@ void testFractionMultiplyTypes()
 }
 
 
-#define RAJA_FRACTION_RUN_TEST(test) \
-  test<int>(); \
+#define RAJA_FRACTION_RUN_TEST(test)                                           \
+  test<int>();                                                                 \
   test<size_t>();
 
 TEST(Fraction, basic_multiply_Fraction)
diff --git a/test/unit/util/test-integral-limits.cpp b/test/unit/util/test-integral-limits.cpp
index 77d2d95bc0..1e68ecc4f4 100644
--- a/test/unit/util/test-integral-limits.cpp
+++ b/test/unit/util/test-integral-limits.cpp
@@ -19,8 +19,7 @@
 
 template <typename T>
 class IntegralLimitsUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(IntegralLimitsUnitTest);
 
@@ -35,5 +34,5 @@ TYPED_TEST_P(IntegralLimitsUnitTest, IntegralLimits)
 REGISTER_TYPED_TEST_SUITE_P(IntegralLimitsUnitTest, IntegralLimits);
 
 INSTANTIATE_TYPED_TEST_SUITE_P(IntegralLimitsUnitTests,
-                              IntegralLimitsUnitTest,
-                              UnitIntegralTypes);
+                               IntegralLimitsUnitTest,
+                               UnitIntegralTypes);
diff --git a/test/unit/util/test-math.cpp b/test/unit/util/test-math.cpp
index 39572ad3a0..dd5b5dbc24 100644
--- a/test/unit/util/test-math.cpp
+++ b/test/unit/util/test-math.cpp
@@ -13,7 +13,7 @@
 #include "RAJA_gtest.hpp"
 #include <type_traits>
 
-template < typename T >
+template <typename T>
 void test_log2()
 {
   ASSERT_EQ(RAJA::log2(T(257)), T(8));
@@ -24,7 +24,8 @@ void test_log2()
   ASSERT_EQ(RAJA::log2(T(2)), T(1));
   ASSERT_EQ(RAJA::log2(T(1)), T(0));
   ASSERT_EQ(RAJA::log2(T(0)), T(0));
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     ASSERT_EQ(RAJA::log2(T(-1)), T(0));
     ASSERT_EQ(RAJA::log2(T(-100)), T(0));
   }
@@ -37,7 +38,7 @@ TEST(math, log2)
 }
 
 
-template < typename T >
+template <typename T>
 void test_next_pow2()
 {
   ASSERT_EQ(RAJA::next_pow2(T(257)), T(512));
@@ -48,7 +49,8 @@ void test_next_pow2()
   ASSERT_EQ(RAJA::next_pow2(T(2)), T(2));
   ASSERT_EQ(RAJA::next_pow2(T(1)), T(1));
   ASSERT_EQ(RAJA::next_pow2(T(0)), T(0));
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     ASSERT_EQ(RAJA::next_pow2(T(-1)), T(0));
     ASSERT_EQ(RAJA::next_pow2(T(-100)), T(0));
   }
@@ -61,7 +63,7 @@ TEST(math, next_pow2)
 }
 
 
-template < typename T >
+template <typename T>
 void test_prev_pow2()
 {
   ASSERT_EQ(RAJA::prev_pow2(T(257)), T(256));
@@ -72,7 +74,8 @@ void test_prev_pow2()
   ASSERT_EQ(RAJA::prev_pow2(T(2)), T(2));
   ASSERT_EQ(RAJA::prev_pow2(T(1)), T(1));
   ASSERT_EQ(RAJA::prev_pow2(T(0)), T(0));
-  if (std::is_signed<T>::value) {
+  if (std::is_signed<T>::value)
+  {
     ASSERT_EQ(RAJA::prev_pow2(T(-1)), T(0));
     ASSERT_EQ(RAJA::prev_pow2(T(-100)), T(0));
   }
@@ -85,7 +88,7 @@ TEST(math, prev_pow2)
 }
 
 
-template < typename T >
+template <typename T>
 void test_power_of_2_mod()
 {
   ASSERT_EQ(RAJA::power_of_2_mod(T(257), T(256)), T(1));
diff --git a/test/unit/util/test-span.cpp b/test/unit/util/test-span.cpp
index e59054cfc6..77ba2c347a 100644
--- a/test/unit/util/test-span.cpp
+++ b/test/unit/util/test-span.cpp
@@ -11,41 +11,24 @@
 
 #include "test-span.hpp"
 
-#define RAJA_SPAN_RUN_TEST(test) \
-  test<int, int>(); \
-  test<int, size_t>(); \
-  test<double, int>(); \
-  test<double, size_t>(); \
+#define RAJA_SPAN_RUN_TEST(test)                                               \
+  test<int, int>();                                                            \
+  test<int, size_t>();                                                         \
+  test<double, int>();                                                         \
+  test<double, size_t>();
 
-TEST(Span, basic_construct_Span)
-{
-  RAJA_SPAN_RUN_TEST(testSpanConstructTypes)
-}
+TEST(Span, basic_construct_Span) {RAJA_SPAN_RUN_TEST(testSpanConstructTypes)}
 
-TEST(Span, basic_assign_Span)
-{
-  RAJA_SPAN_RUN_TEST(testSpanAssignTypes)
-}
+TEST(Span, basic_assign_Span) {RAJA_SPAN_RUN_TEST(testSpanAssignTypes)}
 
-TEST(Span, basic_iterator_Span)
-{
-  RAJA_SPAN_RUN_TEST(testSpanIteratorTypes)
-}
+TEST(Span, basic_iterator_Span) {RAJA_SPAN_RUN_TEST(testSpanIteratorTypes)}
 
-TEST(Span, basic_element_access_Span)
-{
-  RAJA_SPAN_RUN_TEST(testSpanElementAccessTypes)
-}
+TEST(Span,
+     basic_element_access_Span) {RAJA_SPAN_RUN_TEST(testSpanElementAccessTypes)}
 
-TEST(Span, basic_observe_Span)
-{
-  RAJA_SPAN_RUN_TEST(testSpanObserveTypes)
-}
+TEST(Span, basic_observe_Span) {RAJA_SPAN_RUN_TEST(testSpanObserveTypes)}
 
-TEST(Span, basic_subview_Span)
-{
-  RAJA_SPAN_RUN_TEST(testSpanSubViewTypes)
-}
+TEST(Span, basic_subview_Span) {RAJA_SPAN_RUN_TEST(testSpanSubViewTypes)}
 
 TEST(Span, basic_make_span_Span)
 {
diff --git a/test/unit/util/test-span.hpp b/test/unit/util/test-span.hpp
index e76db861fd..b6fff3fe90 100644
--- a/test/unit/util/test-span.hpp
+++ b/test/unit/util/test-span.hpp
@@ -26,7 +26,7 @@ template <typename ValueType, typename IndexType>
 void testSpanConstructTypes()
 {
   constexpr IndexType len = 4;
-  ValueType* ptr = new ValueType[len];
+  ValueType* ptr          = new ValueType[len];
 
   {
     const RAJA::Span<ValueType*, IndexType> span(ptr, len);
@@ -36,7 +36,7 @@ void testSpanConstructTypes()
   }
 
   {
-    const RAJA::Span<ValueType*, IndexType> span(ptr, ptr+len);
+    const RAJA::Span<ValueType*, IndexType> span(ptr, ptr + len);
 
     ASSERT_EQ(ptr, span.data());
     ASSERT_EQ(len, span.size());
@@ -49,7 +49,7 @@ template <typename ValueType, typename IndexType>
 void testSpanAssignTypes()
 {
   constexpr IndexType len = 4;
-  ValueType* ptr = new ValueType[len];
+  ValueType* ptr          = new ValueType[len];
 
   {
     RAJA::Span<ValueType*, IndexType> span(ptr, len);
@@ -61,7 +61,7 @@ void testSpanAssignTypes()
   }
 
   {
-    ValueType* ptr2 = ptr + 1;
+    ValueType* ptr2          = ptr + 1;
     constexpr IndexType len2 = 1;
     RAJA::Span<ValueType*, IndexType> span(ptr, len);
     const RAJA::Span<ValueType*, IndexType> span2(ptr2, len2);
@@ -77,15 +77,15 @@ void testSpanAssignTypes()
 template <typename ValueType, typename IndexType>
 void testSpanIteratorTypes()
 {
-  using span_type = RAJA::Span<ValueType*, IndexType>;
-  using iterator = typename span_type::iterator;
-  using const_iterator = typename span_type::const_iterator;
+  using span_type         = RAJA::Span<ValueType*, IndexType>;
+  using iterator          = typename span_type::iterator;
+  using const_iterator    = typename span_type::const_iterator;
   constexpr IndexType len = 4;
-  ValueType* ptr = new ValueType[len];
+  ValueType* ptr          = new ValueType[len];
 
   // XL cannot handle initialization list with new
   // e.g. new ValueType[len]{0,1,2,3} produces error
-  for ( IndexType ii = 0; ii < len; ++ii )
+  for (IndexType ii = 0; ii < len; ++ii)
   {
     ptr[ii] = static_cast<ValueType>(ii);
   }
@@ -94,27 +94,29 @@ void testSpanIteratorTypes()
     const span_type span(ptr, len);
 
     iterator begin = span.begin();
-    iterator end = span.end();
+    iterator end   = span.end();
     ASSERT_EQ(ptr, begin);
-    ASSERT_EQ(ptr+len, end);
+    ASSERT_EQ(ptr + len, end);
 
     ValueType* ptr_chk = ptr;
 
-    for (iterator iter = begin; iter != end; ++iter) {
+    for (iterator iter = begin; iter != end; ++iter)
+    {
       ASSERT_EQ(*ptr_chk, *iter);
-      ptr_chk++ ;
+      ptr_chk++;
     }
 
     const_iterator cbegin = span.cbegin();
-    const_iterator cend = span.cend();
+    const_iterator cend   = span.cend();
     ASSERT_EQ(ptr, cbegin);
-    ASSERT_EQ(ptr+len, cend);
+    ASSERT_EQ(ptr + len, cend);
 
     ptr_chk = ptr;
 
-    for (iterator citer = cbegin; citer != cend; ++citer) {
+    for (iterator citer = cbegin; citer != cend; ++citer)
+    {
       ASSERT_EQ(*ptr_chk, *citer);
-      ptr_chk++ ;
+      ptr_chk++;
     }
   }
 
@@ -125,11 +127,11 @@ template <typename ValueType, typename IndexType>
 void testSpanElementAccessTypes()
 {
   constexpr IndexType len = 4;
-  ValueType* ptr = new ValueType[len];
+  ValueType* ptr          = new ValueType[len];
 
   // XL cannot handle initialization list with new
   // e.g. new ValueType[len]{0,1,2,3} produces error
-  for ( IndexType ii = 0; ii < len; ++ii )
+  for (IndexType ii = 0; ii < len; ++ii)
   {
     ptr[ii] = static_cast<ValueType>(ii);
   }
@@ -139,9 +141,10 @@ void testSpanElementAccessTypes()
 
     ASSERT_EQ(ptr, span.data());
     ASSERT_EQ(*ptr, span.front());
-    ASSERT_EQ(*(ptr+len-1), span.back());
+    ASSERT_EQ(*(ptr + len - 1), span.back());
 
-    for (IndexType i = 0; i < len; ++i) {
+    for (IndexType i = 0; i < len; ++i)
+    {
       ASSERT_EQ(ptr[i], span[i]);
     }
   }
@@ -153,11 +156,11 @@ template <typename ValueType, typename IndexType>
 void testSpanObserveTypes()
 {
   constexpr IndexType len = 4;
-  ValueType* ptr = new ValueType[len];
+  ValueType* ptr          = new ValueType[len];
 
   // XL cannot handle initialization list with new
   // e.g. new ValueType[len]{0,1,2,3} produces error
-  for ( IndexType ii = 0; ii < len; ++ii )
+  for (IndexType ii = 0; ii < len; ++ii)
   {
     ptr[ii] = static_cast<ValueType>(ii);
   }
@@ -170,7 +173,7 @@ void testSpanObserveTypes()
   }
 
   {
-    const RAJA::Span<ValueType*, IndexType> span(ptr, len-len);
+    const RAJA::Span<ValueType*, IndexType> span(ptr, len - len);
 
     ASSERT_EQ(0, span.size());
     ASSERT_TRUE(span.empty());
@@ -183,11 +186,11 @@ template <typename ValueType, typename IndexType>
 void testSpanSubViewTypes()
 {
   constexpr IndexType len = 4;
-  ValueType* ptr = new ValueType[len];
+  ValueType* ptr          = new ValueType[len];
 
   // XL cannot handle initialization list with new
   // e.g. new ValueType[len]{0,1,2,3} produces error
-  for ( IndexType ii = 0; ii < len; ++ii )
+  for (IndexType ii = 0; ii < len; ++ii)
   {
     ptr[ii] = static_cast<ValueType>(ii);
   }
@@ -207,17 +210,18 @@ void testSpanSubViewTypes()
     const RAJA::Span<ValueType*, IndexType> subspan = span.last(count);
 
     ASSERT_EQ(count, subspan.size());
-    ASSERT_EQ(ptr+len-count, subspan.data());
+    ASSERT_EQ(ptr + len - count, subspan.data());
   }
 
   {
     constexpr IndexType begin = 1;
     constexpr IndexType count = 2;
     const RAJA::Span<ValueType*, IndexType> span(ptr, len);
-    const RAJA::Span<ValueType*, IndexType> subspan = span.subspan(begin, count);
+    const RAJA::Span<ValueType*, IndexType> subspan =
+        span.subspan(begin, count);
 
     ASSERT_EQ(count, subspan.size());
-    ASSERT_EQ(ptr+begin, subspan.data());
+    ASSERT_EQ(ptr + begin, subspan.data());
   }
 
   {
@@ -227,7 +231,7 @@ void testSpanSubViewTypes()
     const RAJA::Span<ValueType*, IndexType> subspan = span.slice(begin, count);
 
     ASSERT_EQ(count, subspan.size());
-    ASSERT_EQ(ptr+begin, subspan.data());
+    ASSERT_EQ(ptr + begin, subspan.data());
   }
 
   delete[] ptr;
@@ -237,7 +241,7 @@ template <typename ValueType, typename IndexType>
 void testSpanMakeSpanTypes()
 {
   constexpr IndexType len = 4;
-  ValueType* ptr = new ValueType[len];
+  ValueType* ptr          = new ValueType[len];
 
   {
     const RAJA::Span<ValueType*, IndexType> span = RAJA::make_span(ptr, len);
diff --git a/test/unit/util/test-timer.cpp b/test/unit/util/test-timer.cpp
index 1688e6497e..ed4ed599ae 100644
--- a/test/unit/util/test-timer.cpp
+++ b/test/unit/util/test-timer.cpp
@@ -51,7 +51,8 @@ TEST(TimerUnitTest, No2)
 
   timer.start("test_timer");
 
-  for (int i = 2; i > 0; --i) {
+  for (int i = 2; i > 0; --i)
+  {
     std::cout << i << std::endl;
     std::this_thread::sleep_for(std::chrono::milliseconds(10));
   }
@@ -73,7 +74,8 @@ TEST(TimerUnitTest, No3)
 
   timer.start("test_timer");
 
-  for (int i = 2; i > 0; --i) {
+  for (int i = 2; i > 0; --i)
+  {
     std::cout << i << std::endl;
     std::this_thread::sleep_for(std::chrono::milliseconds(10));
   }
@@ -95,5 +97,5 @@ TEST(TimerUnitTest, No3)
   std::this_thread::sleep_for(std::chrono::milliseconds(10));
   timer.stop();
   elapsed = timer.elapsed();
-  EXPECT_GT(elapsed, 0.01); 
+  EXPECT_GT(elapsed, 0.01);
 }
diff --git a/test/unit/view-layout/test-indexlayout.cpp b/test/unit/view-layout/test-indexlayout.cpp
index bd7effa8d4..048c91a641 100644
--- a/test/unit/view-layout/test-indexlayout.cpp
+++ b/test/unit/view-layout/test-indexlayout.cpp
@@ -11,97 +11,99 @@
 
 using namespace RAJA;
 
-TEST(IndexLayout, IndexList1D) {
+TEST(IndexLayout, IndexList1D)
+{
   /*
    * Construct a 1D index layout with the index list {1,2,3}
    */
 
-  Index_type arr[3] = {1,2,3};
+  Index_type arr[3] = {1, 2, 3};
 
-  auto index_tuple = make_index_tuple(IndexList<>{&arr[0]});
+  auto index_tuple  = make_index_tuple(IndexList<> {&arr[0]});
   auto index_layout = make_index_layout(index_tuple, 3);
 
   EXPECT_EQ(index_layout(0), 1);
   EXPECT_EQ(index_layout(1), 2);
   EXPECT_EQ(index_layout(2), 3);
-
 }
 
-TEST(IndexLayout, IndexList1DSubsetOfLayout) {
+TEST(IndexLayout, IndexList1DSubsetOfLayout)
+{
   /*
-   * Construct a 1D index layout of arbitrary size greater than 3 
+   * Construct a 1D index layout of arbitrary size greater than 3
    * with the index list {2,3,4}.
    * The purpose of this test is to demonstrate the use case where
    * the index list contains a subset of its index layout
    */
 
-  Index_type arr[3] = {2,3,4};
+  Index_type arr[3] = {2, 3, 4};
 
-  auto index_tuple = make_index_tuple(IndexList<>{&arr[0]});
+  auto index_tuple  = make_index_tuple(IndexList<> {&arr[0]});
   auto index_layout = make_index_layout(index_tuple, 5);
 
   EXPECT_EQ(index_layout(0), 2);
   EXPECT_EQ(index_layout(1), 3);
   EXPECT_EQ(index_layout(2), 4);
-
 }
 
 
-TEST(IndexLayout, ExtractTwoIndices2DLayoutAxis0) {
+TEST(IndexLayout, ExtractTwoIndices2DLayoutAxis0)
+{
   /*
-   * Construct a 2D index layout of size 3x10 with 
+   * Construct a 2D index layout of size 3x10 with
    * the index list {1,2} used along the 0-axis and
    * the direct index used along the 1-axis
-   * Examples: 
+   * Examples:
    *   (index layout index -> regular layout index -> unit stride index)
    *   index_layout(0,1)   -> layout(1,1)          -> 11
    *   index_layout(0,5)   -> layout(1,5)          -> 15
    *   index_layout(1,7)   -> layout(2,7)          -> 27
    */
 
-  Index_type arr[2] = {1,2};
+  Index_type arr[2] = {1, 2};
 
-  auto index_tuple = make_index_tuple(IndexList<>{&arr[0]}, DirectIndex<>());
+  auto index_tuple  = make_index_tuple(IndexList<> {&arr[0]}, DirectIndex<>());
   auto index_layout = make_index_layout(index_tuple, 3, 10);
 
-  for (int i = 0; i < 10; i++ ) {
-    EXPECT_EQ(index_layout(0,i), i+10);
-    EXPECT_EQ(index_layout(1,i), i+20);
+  for (int i = 0; i < 10; i++)
+  {
+    EXPECT_EQ(index_layout(0, i), i + 10);
+    EXPECT_EQ(index_layout(1, i), i + 20);
   }
-
 }
 
-TEST(IndexLayout, ExtractTwoIndices2DLayoutAxis1) {
+TEST(IndexLayout, ExtractTwoIndices2DLayoutAxis1)
+{
   /*
-   * Construct a 2D index layout of size 3x10 with 
+   * Construct a 2D index layout of size 3x10 with
    * the direct index used along the 0-axis and
    * the index list {9,5} used along the 1-axis
-   * Examples: 
+   * Examples:
    *   (index layout index -> regular layout index -> unit stride index)
    *   index_layout(0,1)   -> layout(0,5)          -> 5
    *   index_layout(2,0)   -> layout(2,9)          -> 29
    */
 
-  Index_type arr[2] = {9,5};
+  Index_type arr[2] = {9, 5};
 
-  auto index_tuple = make_index_tuple(DirectIndex<>(), IndexList<>{&arr[0]});
+  auto index_tuple  = make_index_tuple(DirectIndex<>(), IndexList<> {&arr[0]});
   auto index_layout = make_index_layout(index_tuple, 3, 10);
 
-  EXPECT_EQ(index_layout(0,0), 9);
-  EXPECT_EQ(index_layout(0,1), 5);
-  EXPECT_EQ(index_layout(1,0), 19);
-  EXPECT_EQ(index_layout(1,1), 15);
-  EXPECT_EQ(index_layout(2,0), 29);
-  EXPECT_EQ(index_layout(2,1), 25);
-
+  EXPECT_EQ(index_layout(0, 0), 9);
+  EXPECT_EQ(index_layout(0, 1), 5);
+  EXPECT_EQ(index_layout(1, 0), 19);
+  EXPECT_EQ(index_layout(1, 1), 15);
+  EXPECT_EQ(index_layout(2, 0), 29);
+  EXPECT_EQ(index_layout(2, 1), 25);
 }
 
-TEST(IndexLayout, ExtractOneIndex2DLayoutAxis0) {
+TEST(IndexLayout, ExtractOneIndex2DLayoutAxis0)
+{
   /*
-   * Construct a 2D index layout of size 3x3 with 
+   * Construct a 2D index layout of size 3x3 with
    * the index list {2} used along the 0-axis and
    * the direct index used along the 1-axis
-   * Examples: 
+   * Examples:
    *   (index layout index -> regular layout index -> unit stride index)
    *   index_layout(0,1)   -> layout(2,1)          -> 7
    *   index_layout(0,2)   -> layout(2,2)          -> 8
@@ -109,21 +111,21 @@ TEST(IndexLayout, ExtractOneIndex2DLayoutAxis0) {
 
   Index_type arr[1] = {2};
 
-  auto index_tuple = make_index_tuple(IndexList<>{&arr[0]}, DirectIndex<>());
+  auto index_tuple  = make_index_tuple(IndexList<> {&arr[0]}, DirectIndex<>());
   auto index_layout = make_index_layout(index_tuple, 3, 3);
 
-  EXPECT_EQ(index_layout(0,0), 6);
-  EXPECT_EQ(index_layout(0,1), 7);
-  EXPECT_EQ(index_layout(0,2), 8);  
-
+  EXPECT_EQ(index_layout(0, 0), 6);
+  EXPECT_EQ(index_layout(0, 1), 7);
+  EXPECT_EQ(index_layout(0, 2), 8);
 }
 
-TEST(IndexLayout, IndexList2DLayoutExtractOneIndex) {
+TEST(IndexLayout, IndexList2DLayoutExtractOneIndex)
+{
   /*
-   * Construct a 2D index layout of size 3x3 with 
+   * Construct a 2D index layout of size 3x3 with
    * the direct index used along the 0-axis and
    * the index list {2} used along the 1-axis
-   * Examples: 
+   * Examples:
    *   (index layout index -> regular layout index -> unit stride index)
    *   index_layout(1,0)   -> layout(1,2)          -> 5
    *   index_layout(2,0)   -> layout(2,2)          -> 8
@@ -131,21 +133,21 @@ TEST(IndexLayout, IndexList2DLayoutExtractOneIndex) {
 
   Index_type arr[1] = {2};
 
-  auto index_tuple = make_index_tuple(DirectIndex<>(), IndexList<>{&arr[0]});
+  auto index_tuple  = make_index_tuple(DirectIndex<>(), IndexList<> {&arr[0]});
   auto index_layout = make_index_layout(index_tuple, 3, 3);
 
-  EXPECT_EQ(index_layout(0,0), 2);
-  EXPECT_EQ(index_layout(1,0), 5);
-  EXPECT_EQ(index_layout(2,0), 8);
-
+  EXPECT_EQ(index_layout(0, 0), 2);
+  EXPECT_EQ(index_layout(1, 0), 5);
+  EXPECT_EQ(index_layout(2, 0), 8);
 }
 
-TEST(IndexLayout, ConditionalIndexListNullPtr) {
+TEST(IndexLayout, ConditionalIndexListNullPtr)
+{
   /*
-   * Construct a 1D index layout of size 3 with 
+   * Construct a 1D index layout of size 3 with
    * the conditional index list that is a nullptr
    * (conditional index lists always evaluate nullptr to regular indexing)
-   * Examples: 
+   * Examples:
    *   (index layout index -> regular layout index -> unit stride index)
    *   index_layout(0)     -> layout(0)            -> 0
    *   index_layout(2)     -> layout(2)            -> 2
@@ -153,7 +155,7 @@ TEST(IndexLayout, ConditionalIndexListNullPtr) {
 
   Index_type* arr_ptr = nullptr;
 
-  auto index_tuple = make_index_tuple(ConditionalIndexList<>{arr_ptr});
+  auto index_tuple  = make_index_tuple(ConditionalIndexList<> {arr_ptr});
   auto index_layout = make_index_layout(index_tuple, 3);
 
   EXPECT_EQ(index_layout(0), 0);
@@ -161,20 +163,21 @@ TEST(IndexLayout, ConditionalIndexListNullPtr) {
   EXPECT_EQ(index_layout(2), 2);
 }
 
-TEST(IndexLayout, ConditionalIndexListWithIndexList) {
+TEST(IndexLayout, ConditionalIndexListWithIndexList)
+{
   /*
-   * Construct a 1D index layout of size 3 with 
+   * Construct a 1D index layout of size 3 with
    * the conditional index list that is not a nullptr
    * (conditional index lists with index list act the same as IndexList)
-   * Examples: 
+   * Examples:
    *   (index layout index -> regular layout index -> unit stride index)
    *   index_layout(0)     -> layout(1)            -> 1
    *   index_layout(1)     -> layout(2)            -> 2
    */
 
-  Index_type arr[2] = {1,2};
+  Index_type arr[2] = {1, 2};
 
-  auto index_tuple = make_index_tuple(ConditionalIndexList<>{&arr[0]});
+  auto index_tuple  = make_index_tuple(ConditionalIndexList<> {&arr[0]});
   auto index_layout = make_index_layout(index_tuple, 3);
 
   EXPECT_EQ(index_layout(0), 1);
@@ -184,18 +187,18 @@ TEST(IndexLayout, ConditionalIndexListWithIndexList) {
 TEST(IndexLayout, View1DLayout)
 {
   /*
-   * Construct a 1D index layout of size 5 with 
+   * Construct a 1D index layout of size 5 with
    * the index list {4,2,3} and pass to a 1D view with the data {5,10,15,20,25}
-   * Examples: 
-   *   (index layout index -> regular layout index -> unit stride index -> view at index)
-   *   index_layout(0)     -> layout(4)            -> 4                 -> 25
+   * Examples:
+   *   (index layout index -> regular layout index -> unit stride index -> view at index)
+   *   index_layout(0)     -> layout(4)            -> 4                 -> 25
    *   index_layout(2)     -> layout(3)            -> 3                 -> 20
    */
-  
-  Index_type data[5] = {5,10,15,20,25};
-  Index_type index_list[3] = {4,2,3};
 
-  auto index_tuple = make_index_tuple(IndexList<>{&index_list[0]});
+  Index_type data[5]       = {5, 10, 15, 20, 25};
+  Index_type index_list[3] = {4, 2, 3};
+
+  auto index_tuple  = make_index_tuple(IndexList<> {&index_list[0]});
   auto index_layout = make_index_layout(index_tuple, 5);
 
   auto view = make_index_view(&data[0], index_layout);
@@ -203,18 +206,17 @@ TEST(IndexLayout, View1DLayout)
   EXPECT_EQ(view(0), 25);
   EXPECT_EQ(view(1), 15);
   EXPECT_EQ(view(2), 20);
-
 }
 
 TEST(IndexLayout, View2DLayout)
 {
   /*
-   * Construct a 2D index layout of size 2x3 with 
+   * Construct a 2D index layout of size 2x3 with
    * the direct index used along the 0-axis and
    * the index list {1,2} used along the 1-axis and
    * pass to a 2D view of size 2x3 with the each entry being i*j
    * for i,j in [0,2)x[0,3) (e.g. view(1,2) = 1*2, view(0,2) = 0*2, etc..)
-   * Examples: 
+   * Examples:
    *   (index layout index -> view index -> view at index)
    *   index_layout(0,1)   -> view(0,2)  -> 0
    *   index_layout(1,0)   -> view(1,1)  -> 1
@@ -222,112 +224,121 @@ TEST(IndexLayout, View2DLayout)
 
   Index_type data[2][3];
 
-  for (int i = 0; i < 2; i ++ ) {
-    for (int j = 0; j < 3; j ++ ) {
-      data[i][j] = i*j;
+  for (int i = 0; i < 2; i++)
+  {
+    for (int j = 0; j < 3; j++)
+    {
+      data[i][j] = i * j;
     }
   }
 
-  Index_type index_list[2] = {1,2};
+  Index_type index_list[2] = {1, 2};
 
-  auto index_tuple = make_index_tuple(DirectIndex<>(), IndexList<>{&index_list[0]});
+  auto index_tuple =
+      make_index_tuple(DirectIndex<>(), IndexList<> {&index_list[0]});
   auto index_layout = make_index_layout(index_tuple, 2, 3);
 
   auto view = make_index_view(&data[0][0], index_layout);
 
-  for (int i = 0; i < 2; i ++ ) {
-    for (int j = 0; j < 2; j ++ ) {
-      EXPECT_EQ(view(i,j), i*(j+1));
+  for (int i = 0; i < 2; i++)
+  {
+    for (int j = 0; j < 2; j++)
+    {
+      EXPECT_EQ(view(i, j), i * (j + 1));
     }
   }
-
 }
 
 TEST(IndexLayout, View3DLayout)
 {
   /*
-   * Construct a 3D index layout of size 2x3x4 with 
+   * Construct a 3D index layout of size 2x3x4 with
    * the direct index used along the 0-axis and
    * the index list {1,2} used along the 1-axis and
    * the index list {2,3} used along the 2-axis and
    * pass to a 3D view of size 2x3x4 with the each entry being i*j*k
-   * for i,j,k in [0,2)x[0,3)x[0,4) (e.g. view(1,2,3) = 1*2*3, view(0,2,2) = 0*2*2, etc..)
-   * Examples: 
-   *   (index layout index -> view index -> view at index)
+   * for i,j,k in [0,2)x[0,3)x[0,4) (e.g. view(1,2,3) = 1*2*3, view(0,2,2) = 0*2*2, etc..)
+   * Examples: (index layout index -> view index -> view at index)
    *   index_layout(0,1,0) -> view(0,2,2)-> 0
    *   index_layout(2,1,1) -> view(2,2,3)-> 12
    */
-  
+
   Index_type data[2][3][4];
 
-  for (int i = 0; i < 2; i ++ ) {
-    for (int j = 0; j < 3; j ++ ) {
-      for (int k = 0; k < 4; k ++ ) {
-	data[i][j][k] = i*j*k;
+  for (int i = 0; i < 2; i++)
+  {
+    for (int j = 0; j < 3; j++)
+    {
+      for (int k = 0; k < 4; k++)
+      {
+        data[i][j][k] = i * j * k;
       }
     }
   }
 
-  Index_type index_list_j[2] = {1,2};
-  Index_type index_list_k[2] = {2,3};
+  Index_type index_list_j[2] = {1, 2};
+  Index_type index_list_k[2] = {2, 3};
 
-  auto index_tuple = make_index_tuple(DirectIndex<>(), 
-                                      IndexList<>{&index_list_j[0]},
-                                      IndexList<>{&index_list_k[0]});
+  auto index_tuple =
+      make_index_tuple(DirectIndex<>(), IndexList<> {&index_list_j[0]},
+                       IndexList<> {&index_list_k[0]});
 
   auto index_layout = make_index_layout(index_tuple, 2, 3, 4);
 
   auto view = make_index_view(&data[0][0][0], index_layout);
 
-  for (int i = 0; i < 2; i ++ ) {
-    for (int j = 0; j < 2; j ++ ) {
-      for (int k = 0; k < 2; k ++ ) {
-        EXPECT_EQ(view(i,j,k), i*(j+1)*(k+2));
+  for (int i = 0; i < 2; i++)
+  {
+    for (int j = 0; j < 2; j++)
+    {
+      for (int k = 0; k < 2; k++)
+      {
+        EXPECT_EQ(view(i, j, k), i * (j + 1) * (k + 2));
       }
     }
   }
-
 }
 
 TEST(IndexLayout, MultiView1DLayout)
 {
   /*
-   * Construct a 1D index layout of size 4 with 
-   * the index list {1,2} and pass to a 1D multiview containing two 1D views of size 4 with
-   * the first view having each entry be the square of its index (e.g. view(2) = 2*2 = 4)
-   * and the second view having each entry be the cube of its index (e.g. view(3) = 3*3*3 = 27)
-   * Examples: 
-   *   (index layout index -> mutiview index -> view at index)
-   *   index_layout(0,1)   -> view(0,2)      -> 4
+   * Construct a 1D index layout of size 4 with the index list {1,2} and pass
+   * to a 1D multiview containing two 1D views of size 4, the first holding the
+   * square of each index (e.g. view(2) = 2*2 = 4) and the second holding the
+   * cube of each index (e.g. view(3) = 3*3*3 = 27).
+   * Examples: (index layout index -> multiview index -> view at index)
+   *   index_layout(0,1)   -> view(0,2)      -> 4
    *   index_layout(1,0)   -> view(1,1)      -> 1
    */
 
   Index_type data_squared[4];
   Index_type data_cubed[4];
 
-  for (int i = 0; i < 4; i ++ ) {
-    data_squared[i] = i*i;
+  for (int i = 0; i < 4; i++)
+  {
+    data_squared[i] = i * i;
   }
-  
-  for (int i = 0; i < 4; i ++ ) {
-    data_cubed[i] = i*i*i;
+
+  for (int i = 0; i < 4; i++)
+  {
+    data_cubed[i] = i * i * i;
   }
 
   Index_type* data_array[2];
   data_array[0] = data_squared;
   data_array[1] = data_cubed;
 
-  Index_type index_list[2] = {1,2};
+  Index_type index_list[2] = {1, 2};
 
-  auto index_tuple = make_index_tuple(IndexList<>{&index_list[0]});
+  auto index_tuple  = make_index_tuple(IndexList<> {&index_list[0]});
   auto index_layout = make_index_layout(index_tuple, 4);
 
-  auto view = MultiView<Index_type, IndexLayout<1, Index_type, IndexList<> > >(data_array, index_layout);
+  auto view = MultiView<Index_type, IndexLayout<1, Index_type, IndexList<>>>(
+      data_array, index_layout);
 
-  for (int i = 0; i < 2; i ++ ) {
-    EXPECT_EQ(view(0,i), data_squared[i+1]);
-    EXPECT_EQ(view(1,i), data_cubed[i+1]);
+  for (int i = 0; i < 2; i++)
+  {
+    EXPECT_EQ(view(0, i), data_squared[i + 1]);
+    EXPECT_EQ(view(1, i), data_cubed[i + 1]);
   }
-
 }
-
diff --git a/test/unit/view-layout/test-makelayout.cpp b/test/unit/view-layout/test-makelayout.cpp
index af8b6db71e..a1377ddc26 100644
--- a/test/unit/view-layout/test-makelayout.cpp
+++ b/test/unit/view-layout/test-makelayout.cpp
@@ -9,19 +9,18 @@
 
 TEST(LayoutUnitTest, OffsetVsRegular)
 {
-  const auto layout =
-      RAJA::make_permuted_layout({{6, 6}},
-                                 RAJA::as_array<RAJA::Perm<1, 0>>::get());
-  const auto offset =
-      RAJA::make_permuted_offset_layout({{0, 0}},
-                                        {{6, 6}},
-                                        RAJA::as_array<RAJA::PERM_JI>::get());
+  const auto layout = RAJA::make_permuted_layout(
+      {{6, 6}}, RAJA::as_array<RAJA::Perm<1, 0>>::get());
+  const auto offset = RAJA::make_permuted_offset_layout(
+      {{0, 0}}, {{6, 6}}, RAJA::as_array<RAJA::PERM_JI>::get());
 
   /*
    * OffsetLayout with 0 offset should function like the regular Layout.
    */
-  for (int j = 0; j < 6; ++j) {
-    for (int i = 0; i < 6; ++i) {
+  for (int j = 0; j < 6; ++j)
+  {
+    for (int i = 0; i < 6; ++i)
+    {
       ASSERT_EQ(offset(i, j), layout(i, j))
           << layout.strides[0] << layout.strides[1];
     }
@@ -67,10 +66,8 @@ TEST(OffsetLayoutUnitTest, 2D_JI)
    * (-1, -1), (0, -1), (1, -1)
    * (-1, -2), (0, -2), (1, -2)
    */
-  const my_layout layout =
-      RAJA::make_permuted_offset_layout({{-1, -2}},
-                                        {{2, 1}},
-                                        RAJA::as_array<RAJA::PERM_JI>::get());
+  const my_layout layout = RAJA::make_permuted_offset_layout(
+      {{-1, -2}}, {{2, 1}}, RAJA::as_array<RAJA::PERM_JI>::get());
 
   /*
    * First element, (-1, -2), should have index 0.
@@ -107,9 +104,8 @@ TEST(LayoutUnitTest, 3D_KJI_ProjJ)
 
   // Construct using variadic "sizes" ctor
   // Zero for J size should correctly produce projective layout
-  const my_layout layout =
-      RAJA::make_permuted_layout({{3, 0, 7}},
-                                 RAJA::as_array<RAJA::PERM_KJI>::get());
+  const my_layout layout = RAJA::make_permuted_layout(
+      {{3, 0, 7}}, RAJA::as_array<RAJA::PERM_KJI>::get());
 
   ASSERT_EQ(0, layout(0, 0, 0));
 
@@ -124,7 +120,8 @@ TEST(LayoutUnitTest, 3D_KJI_ProjJ)
   ASSERT_EQ(12, layout(0, 0, 4));
 
   // Check that we get the identity (mod 21)
-  for (int x = 0; x < 40; ++x) {
+  for (int x = 0; x < 40; ++x)
+  {
 
     // inverse map
     int i, j, k;
@@ -144,7 +141,8 @@ TEST(LayoutUnitTest, 3D_KJI_ProjJ)
 TEST(LayoutUnitTest, 2D_StrideOne)
 {
   using my_layout = RAJA::Layout<2>;
-  using my_layout_s1 = RAJA::Layout<2, ptrdiff_t, 0>; // first index is stride-1
+  using my_layout_s1 =
+      RAJA::Layout<2, ptrdiff_t, 0>;  // first index is stride-1
 
   /*
    * Construct a 2D layout:
@@ -155,9 +153,8 @@ TEST(LayoutUnitTest, 2D_StrideOne)
    * Linear indices range from [0, 15)
    *
    */
-  const my_layout layout =
-      RAJA::make_permuted_layout({{3, 5}},
-                                 RAJA::as_array<RAJA::PERM_JI>::get());
+  const my_layout layout = RAJA::make_permuted_layout(
+      {{3, 5}}, RAJA::as_array<RAJA::PERM_JI>::get());
 
 
   /*
@@ -167,8 +164,10 @@ TEST(LayoutUnitTest, 2D_StrideOne)
 
 
   // Check that we get the same layout
-  for (int i = 0; i < 3; ++i) {
-    for (int j = 0; j < 5; ++j) {
+  for (int i = 0; i < 3; ++i)
+  {
+    for (int j = 0; j < 5; ++j)
+    {
 
       ASSERT_EQ(layout(i, j), layout_s1(i, j));
     }
@@ -178,44 +177,49 @@ TEST(LayoutUnitTest, 2D_StrideOne)
 TEST(StaticLayoutUnitTest, 2D_StaticLayout)
 {
   RAJA::Layout<2> dynamic_layout(7, 5);
-  using static_layout = RAJA::StaticLayout<RAJA::PERM_IJ,7,5>;
-  
+  using static_layout = RAJA::StaticLayout<RAJA::PERM_IJ, 7, 5>;
+
   // Check that we get the same layout
-  for (int i = 0; i < 7; ++i) {
-    for (int j = 0; j < 5; ++j) {
+  for (int i = 0; i < 7; ++i)
+  {
+    for (int j = 0; j < 5; ++j)
+    {
 
-      ASSERT_EQ(dynamic_layout(i, j), static_layout::s_oper(i,j));
+      ASSERT_EQ(dynamic_layout(i, j), static_layout::s_oper(i, j));
     }
   }
 }
 
 TEST(StaticLayoutUnitTest, 2D_PermutedStaticLayout)
 {
-  auto dynamic_layout = 
-    RAJA::make_permuted_layout({{7, 5}},
-                               RAJA::as_array<RAJA::PERM_JI>::get());
-  using static_layout = RAJA::StaticLayout<RAJA::PERM_JI, 7,5>;
-  
+  auto dynamic_layout = RAJA::make_permuted_layout(
+      {{7, 5}}, RAJA::as_array<RAJA::PERM_JI>::get());
+  using static_layout = RAJA::StaticLayout<RAJA::PERM_JI, 7, 5>;
+
   // Check that we get the same layout
-  for (int i = 0; i < 7; ++i) {
-    for (int j = 0; j < 5; ++j) {
-      ASSERT_EQ(dynamic_layout(i, j), static_layout::s_oper(i,j));
+  for (int i = 0; i < 7; ++i)
+  {
+    for (int j = 0; j < 5; ++j)
+    {
+      ASSERT_EQ(dynamic_layout(i, j), static_layout::s_oper(i, j));
     }
   }
 }
 
 TEST(StaticLayoutUnitTest, 3D_PermutedStaticLayout)
 {
-  auto dynamic_layout = 
-    RAJA::make_permuted_layout({{7, 13, 5}},
-                               RAJA::as_array<RAJA::PERM_JKI>::get());
-  using static_layout = RAJA::StaticLayout<RAJA::PERM_JKI, 7,13,5>;
+  auto dynamic_layout = RAJA::make_permuted_layout(
+      {{7, 13, 5}}, RAJA::as_array<RAJA::PERM_JKI>::get());
+  using static_layout = RAJA::StaticLayout<RAJA::PERM_JKI, 7, 13, 5>;
 
   // Check that we get the same layout
-  for (int i = 0; i < 7; ++i) {
-    for (int j = 0; j < 13; ++j) {
-      for (int k = 0; k < 5; ++k) {
-        ASSERT_EQ(dynamic_layout(i, j, k), static_layout::s_oper(i,j,k));
+  for (int i = 0; i < 7; ++i)
+  {
+    for (int j = 0; j < 13; ++j)
+    {
+      for (int k = 0; k < 5; ++k)
+      {
+        ASSERT_EQ(dynamic_layout(i, j, k), static_layout::s_oper(i, j, k));
       }
     }
   }
@@ -224,21 +228,23 @@ TEST(StaticLayoutUnitTest, 3D_PermutedStaticLayout)
 
 TEST(StaticLayoutUnitTest, 4D_PermutedStaticLayout)
 {
-  auto dynamic_layout = 
-    RAJA::make_permuted_layout({{7, 13, 5, 17}},
-                               RAJA::as_array<RAJA::PERM_LJKI>::get());
-  using static_layout = RAJA::StaticLayout<RAJA::PERM_LJKI, 7,13,5,17>;
+  auto dynamic_layout = RAJA::make_permuted_layout(
+      {{7, 13, 5, 17}}, RAJA::as_array<RAJA::PERM_LJKI>::get());
+  using static_layout = RAJA::StaticLayout<RAJA::PERM_LJKI, 7, 13, 5, 17>;
 
   // Check that we get the same layout
-  for (int i = 0; i < 7; ++i) {
-    for (int j = 0; j < 13; ++j) {
-      for (int k = 0; k < 5; ++k) {
-        for (int l = 0; l < 5; ++l) {
-          ASSERT_EQ(dynamic_layout(i, j, k, l), static_layout::s_oper(i,j,k,l));
-        } 
+  for (int i = 0; i < 7; ++i)
+  {
+    for (int j = 0; j < 13; ++j)
+    {
+      for (int k = 0; k < 5; ++k)
+      {
+        for (int l = 0; l < 5; ++l)
+        {
+          ASSERT_EQ(dynamic_layout(i, j, k, l),
+                    static_layout::s_oper(i, j, k, l));
+        }
       }
     }
   }
 }
-
-
diff --git a/test/unit/view-layout/test-multiview.cpp b/test/unit/view-layout/test-multiview.cpp
index c841c718a6..60efb37df9 100644
--- a/test/unit/view-layout/test-multiview.cpp
+++ b/test/unit/view-layout/test-multiview.cpp
@@ -11,14 +11,17 @@
 RAJA_INDEX_VALUE(TIX, "TIX");
 RAJA_INDEX_VALUE(TIL, "TIL");
 
-template<typename T>
-class MultiViewUnitTest : public ::testing::Test {};
+template <typename T>
+class MultiViewUnitTest : public ::testing::Test
+{};
 
-template<typename T>
-class OffsetLayoutMultiViewUnitTest : public ::testing::Test {};
+template <typename T>
+class OffsetLayoutMultiViewUnitTest : public ::testing::Test
+{};
 
-template<typename T>
-class TypedIntegralMultiViewUnitTest : public ::testing::Test {};
+template <typename T>
+class TypedIntegralMultiViewUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(MultiViewUnitTest, UnitIntFloatTypes);
 TYPED_TEST_SUITE(OffsetLayoutMultiViewUnitTest, UnitIntFloatTypes);
@@ -29,72 +32,76 @@ TYPED_TEST(MultiViewUnitTest, Constructors)
 
   using layout = RAJA::Layout<1>;
 
-  TypeParam   a1[10];
-  TypeParam   a2[10];
-  TypeParam * data[2];
+  TypeParam a1[10];
+  TypeParam a2[10];
+  TypeParam* data[2];
 
   data[0] = a1;
   data[1] = a2;
 
   constexpr int val = 8;
-  a1[0] = val;
-  a2[0] = val;
+  a1[0]             = val;
+  a2[0]             = val;
 
   RAJA::MultiView<TypeParam, layout> view(data, layout(10));
-  ASSERT_EQ( val, view(0,0) );
+  ASSERT_EQ(val, view(0, 0));
 
   /*
-   * Should be able to construct a non-const MultiView from a non-const MultiView
+   * Should be able to construct a non-const MultiView from a non-const
+   * MultiView
    */
   RAJA::MultiView<TypeParam, layout> view2(view);
-  ASSERT_EQ( val, view2(0,0) );
+  ASSERT_EQ(val, view2(0, 0));
 
   /*
    * Should be able to construct a const MultiView from a non-const MultiView
    */
   RAJA::MultiView<TypeParam const, layout> const_view(view);
-  ASSERT_EQ( val, const_view(0,0) );
+  ASSERT_EQ(val, const_view(0, 0));
 
   /*
    * Should be able to construct a const MultiView from a const MultiView
    */
   RAJA::MultiView<TypeParam const, layout> const_view2(const_view);
-  ASSERT_EQ( val, const_view2(0,0) );
+  ASSERT_EQ(val, const_view2(0, 0));
 
 
-  // non-default construction of MultiView with array-of-pointers index moved to 1st position
+  // non-default construction of MultiView with array-of-pointers index moved to
+  // 1st position
   RAJA::MultiView<TypeParam, layout, 1> view1p(data, layout(10));
-  ASSERT_EQ( val, view1p(0,0) );
+  ASSERT_EQ(val, view1p(0, 0));
 
   // construct a non-const MultiView from a non-const MultiView
   RAJA::MultiView<TypeParam, layout, 1> view1p2(view1p);
-  ASSERT_EQ( val, view1p2(0,0) );
+  ASSERT_EQ(val, view1p2(0, 0));
 
   // construct a const MultiView from a non-const MultiView
   RAJA::MultiView<TypeParam const, layout, 1> const_view1p(view1p);
-  ASSERT_EQ( val, const_view1p(0,0) );
+  ASSERT_EQ(val, const_view1p(0, 0));
 
   // construct a const MultiView from a const MultiView
   RAJA::MultiView<TypeParam const, layout, 1> const_view1p2(const_view1p);
-  ASSERT_EQ( val, const_view1p2(0,0) );
+  ASSERT_EQ(val, const_view1p2(0, 0));
 
 
-  // non-default construction of MultiView with array-of-pointers index moved to 1st position
-  // and non-const pointer type specification (used in CHAI)
-  RAJA::MultiView<TypeParam, layout, 1, TypeParam **> view1pnc(data, layout(10));
-  ASSERT_EQ( val, view1pnc(0,0) );
+  // non-default construction of MultiView with array-of-pointers index moved to
+  // 1st position and non-const pointer type specification (used in CHAI)
+  RAJA::MultiView<TypeParam, layout, 1, TypeParam**> view1pnc(data, layout(10));
+  ASSERT_EQ(val, view1pnc(0, 0));
 
   // construct a non-const MultiView from a non-const MultiView
-  RAJA::MultiView<TypeParam, layout, 1, TypeParam **> view1pnc2(view1pnc);
-  ASSERT_EQ( val, view1pnc2(0,0) );
+  RAJA::MultiView<TypeParam, layout, 1, TypeParam**> view1pnc2(view1pnc);
+  ASSERT_EQ(val, view1pnc2(0, 0));
 
   // construct a const MultiView from a non-const MultiView
-  RAJA::MultiView<TypeParam const, layout, 1, TypeParam **> const_view1pnc(view1pnc);
-  ASSERT_EQ( val, const_view1pnc(0,0) );
+  RAJA::MultiView<TypeParam const, layout, 1, TypeParam**> const_view1pnc(
+      view1pnc);
+  ASSERT_EQ(val, const_view1pnc(0, 0));
 
   // construct a const MultiView from a const MultiView
-  RAJA::MultiView<TypeParam const, layout, 1, TypeParam **> const_view1pnc2(const_view1pnc);
-  ASSERT_EQ( val, const_view1pnc2(0,0) );
+  RAJA::MultiView<TypeParam const, layout, 1, TypeParam**> const_view1pnc2(
+      const_view1pnc);
+  ASSERT_EQ(val, const_view1pnc2(0, 0));
 }
 
 TYPED_TEST(MultiViewUnitTest, Accessor)
@@ -103,48 +110,51 @@ TYPED_TEST(MultiViewUnitTest, Accessor)
   const int Nx = 3;
   const int Ny = 5;
   const int Nz = 2;
-  const int N  = Nx*Ny*Nz;
-  TypeParam *b = new TypeParam[N];
-  TypeParam *c = new TypeParam[N];
-  TypeParam *a[2];
+  const int N  = Nx * Ny * Nz;
+  TypeParam* b = new TypeParam[N];
+  TypeParam* c = new TypeParam[N];
+  TypeParam* a[2];
 
   a[0] = b;
   a[1] = c;
 
-  int iter{0};
-  for(TypeParam i=0; i<TypeParam{N}; ++i)
+  int iter {0};
+  for (TypeParam i = 0; i < TypeParam {N}; ++i)
   {
-    a[0][iter] = TypeParam{i};
-    a[1][iter] = TypeParam{i}+1;
+    a[0][iter] = TypeParam {i};
+    a[1][iter] = TypeParam {i} + 1;
     ++iter;
   }
 
   /*
    * 1D Accessor
    */
-  RAJA::MultiView<TypeParam, RAJA::Layout<1>> view_1D(a,N);
-  RAJA::MultiView<TypeParam, RAJA::Layout<1>,1> view_1D1p(a,N);
-  TypeParam val{0};
-  for(int i=0; i<N; ++i) {
-    ASSERT_EQ(val, view_1D(0,i));
-    ASSERT_EQ(val+1, view_1D(1,i));
-    ASSERT_EQ(val, view_1D1p(i,0));
-    ASSERT_EQ(val+1, view_1D1p(i,1));
+  RAJA::MultiView<TypeParam, RAJA::Layout<1>> view_1D(a, N);
+  RAJA::MultiView<TypeParam, RAJA::Layout<1>, 1> view_1D1p(a, N);
+  TypeParam val {0};
+  for (int i = 0; i < N; ++i)
+  {
+    ASSERT_EQ(val, view_1D(0, i));
+    ASSERT_EQ(val + 1, view_1D(1, i));
+    ASSERT_EQ(val, view_1D1p(i, 0));
+    ASSERT_EQ(val + 1, view_1D1p(i, 1));
     val++;
   }
 
   /*
    * 2D Accessor
    */
-  RAJA::MultiView<TypeParam, RAJA::Layout<2>> view_2D(a,Ny,Nx);
-  RAJA::MultiView<TypeParam, RAJA::Layout<2>,1> view_2D1p(a,Ny,Nx);
-  val = TypeParam{0};
-  for(int j=0; j<Ny; ++j) {
-    for(int i=0; i<Nx; ++i) {
-      ASSERT_EQ(val, view_2D(0,j,i));
-      ASSERT_EQ(val+1, view_2D(1,j,i));
-      ASSERT_EQ(val, view_2D1p(j,0,i));
-      ASSERT_EQ(val+1, view_2D1p(j,1,i));
+  RAJA::MultiView<TypeParam, RAJA::Layout<2>> view_2D(a, Ny, Nx);
+  RAJA::MultiView<TypeParam, RAJA::Layout<2>, 1> view_2D1p(a, Ny, Nx);
+  val = TypeParam {0};
+  for (int j = 0; j < Ny; ++j)
+  {
+    for (int i = 0; i < Nx; ++i)
+    {
+      ASSERT_EQ(val, view_2D(0, j, i));
+      ASSERT_EQ(val + 1, view_2D(1, j, i));
+      ASSERT_EQ(val, view_2D1p(j, 0, i));
+      ASSERT_EQ(val + 1, view_2D1p(j, 1, i));
       val++;
     }
   }
@@ -152,16 +162,19 @@ TYPED_TEST(MultiViewUnitTest, Accessor)
   /*
    * 3D Accessor
    */
-  RAJA::MultiView<TypeParam, RAJA::Layout<3>> view_3D(a,Nz,Ny,Nx);
-  RAJA::MultiView<TypeParam, RAJA::Layout<3>,2> view_3D1p(a,Nz,Ny,Nx);
-  val = TypeParam{0};
-  for(int k=0; k<Nz; ++k) {
-    for(int j=0; j<Ny; ++j) {
-      for(int i=0; i<Nx; ++i) {
-        ASSERT_EQ(val, view_3D(0,k,j,i));
-        ASSERT_EQ(val+1, view_3D(1,k,j,i));
-        ASSERT_EQ(val, view_3D1p(k,j,0,i));
-        ASSERT_EQ(val+1, view_3D1p(k,j,1,i));
+  RAJA::MultiView<TypeParam, RAJA::Layout<3>> view_3D(a, Nz, Ny, Nx);
+  RAJA::MultiView<TypeParam, RAJA::Layout<3>, 2> view_3D1p(a, Nz, Ny, Nx);
+  val = TypeParam {0};
+  for (int k = 0; k < Nz; ++k)
+  {
+    for (int j = 0; j < Ny; ++j)
+    {
+      for (int i = 0; i < Nx; ++i)
+      {
+        ASSERT_EQ(val, view_3D(0, k, j, i));
+        ASSERT_EQ(val + 1, view_3D(1, k, j, i));
+        ASSERT_EQ(val, view_3D1p(k, j, 0, i));
+        ASSERT_EQ(val + 1, view_3D1p(k, j, 1, i));
         val++;
       }
     }
@@ -185,20 +198,23 @@ TYPED_TEST(OffsetLayoutMultiViewUnitTest, View)
   /*
    * MultiView is constructed by passing in the layout.
    */
-  std::array<RAJA::Index_type, 1> lower{{1}};
-  std::array<RAJA::Index_type, 1> upper{{11}};
-  RAJA::MultiView<TypeParam, layout> view(data, RAJA::make_offset_layout<1>(lower, upper));
-  RAJA::MultiView<TypeParam, layout,1> view1p(data, RAJA::make_offset_layout<1>(lower, upper));
-
-  for (int i = 0; i < 10; i++) {
+  std::array<RAJA::Index_type, 1> lower {{1}};
+  std::array<RAJA::Index_type, 1> upper {{11}};
+  RAJA::MultiView<TypeParam, layout> view(
+      data, RAJA::make_offset_layout<1>(lower, upper));
+  RAJA::MultiView<TypeParam, layout, 1> view1p(
+      data, RAJA::make_offset_layout<1>(lower, upper));
+
+  for (int i = 0; i < 10; i++)
+  {
     data[0][i] = static_cast<TypeParam>(i);
-    data[1][i] = static_cast<TypeParam>(i+1);
+    data[1][i] = static_cast<TypeParam>(i + 1);
   }
 
-  ASSERT_EQ(data[0][0], view(0,1));
-  ASSERT_EQ(data[1][9], view(1,10));
-  ASSERT_EQ(data[0][0], view1p(1,0));
-  ASSERT_EQ(data[1][9], view1p(10,1));
+  ASSERT_EQ(data[0][0], view(0, 1));
+  ASSERT_EQ(data[1][9], view(1, 10));
+  ASSERT_EQ(data[0][0], view1p(1, 0));
+  ASSERT_EQ(data[1][9], view1p(10, 1));
 
   delete[] d1;
   delete[] d2;
@@ -207,48 +223,50 @@ TYPED_TEST(OffsetLayoutMultiViewUnitTest, View)
 TYPED_TEST(MultiViewUnitTest, Shift1D)
 {
 
-  int N = 10;
-  TypeParam *reala = new TypeParam[N];
-  TypeParam *realb = new TypeParam[N];
-  TypeParam *a[2];
+  int N            = 10;
+  TypeParam* reala = new TypeParam[N];
+  TypeParam* realb = new TypeParam[N];
+  TypeParam* a[2];
   a[0] = reala;
   a[1] = realb;
 
-  //Create a view from a base view
-  const int DIM = 1;
-  RAJA::OffsetLayout<DIM> layout = RAJA::make_offset_layout<DIM>({{0}},{{N}});
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> A(a,layout);
-  RAJA::MultiView<TypeParam, RAJA::Layout<DIM>> B(a,N);
+  // Create a view from a base view
+  const int DIM                  = 1;
+  RAJA::OffsetLayout<DIM> layout = RAJA::make_offset_layout<DIM>({{0}}, {{N}});
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> A(a, layout);
+  RAJA::MultiView<TypeParam, RAJA::Layout<DIM>> B(a, N);
 
-  for(int i=0; i<N; ++i) {
-    A(0,i) = static_cast<TypeParam>(i + 1);
-    B(1,i) = static_cast<TypeParam>(i + 1);
+  for (int i = 0; i < N; ++i)
+  {
+    A(0, i) = static_cast<TypeParam>(i + 1);
+    B(1, i) = static_cast<TypeParam>(i + 1);
   }
 
   RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> Ashift = A.shift({{N}});
   RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> Bshift = B.shift({{N}});
 
-  for(int i=N; i<2*N; ++i)
+  for (int i = N; i < 2 * N; ++i)
   {
-    ASSERT_EQ(Ashift(0,i),A(0,i-N));
-    ASSERT_EQ(Bshift(1,i),B(1,i-N));
+    ASSERT_EQ(Ashift(0, i), A(0, i - N));
+    ASSERT_EQ(Bshift(1, i), B(1, i - N));
   }
 
   // offset layout with MultiView with array-of-pointers index in 1st position
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>, 1> C(a,layout);
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>, 1> Cshift = C.shift({{N}});
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>, 1> C(a, layout);
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>, 1> Cshift =
+      C.shift({{N}});
 
-  for(int i=N; i<2*N; ++i)
+  for (int i = N; i < 2 * N; ++i)
   {
-    ASSERT_EQ(Cshift(i,0),C(i-N,0));
-    ASSERT_EQ(Cshift(i,1),C(i-N,1));
-    ASSERT_EQ(Ashift(0,i),C(i-N,0));
-    ASSERT_EQ(Cshift(i,0),A(0,i-N));
+    ASSERT_EQ(Cshift(i, 0), C(i - N, 0));
+    ASSERT_EQ(Cshift(i, 1), C(i - N, 1));
+    ASSERT_EQ(Ashift(0, i), C(i - N, 0));
+    ASSERT_EQ(Cshift(i, 0), A(0, i - N));
   }
 
 
-  //Create a shifted view from a view with a typed layout
-  using TLayout = RAJA::TypedLayout<TIL, RAJA::tuple<TIX>>;
+  // Create a shifted view from a view with a typed layout
+  using TLayout       = RAJA::TypedLayout<TIL, RAJA::tuple<TIX>>;
   using TOffsetLayout = RAJA::TypedOffsetLayout<TIL, RAJA::tuple<TIX>>;
 
   TLayout myLayout(10);
@@ -256,9 +274,9 @@ TYPED_TEST(MultiViewUnitTest, Shift1D)
   RAJA::MultiView<TypeParam, TLayout> D(a, myLayout);
   RAJA::MultiView<TypeParam, TOffsetLayout> Dshift = D.shift({{N}});
 
-  for(TIX i=TIX{N}; i<TIX{2*N}; ++i)
+  for (TIX i = TIX {N}; i < TIX {2 * N}; ++i)
   {
-    ASSERT_EQ(Dshift(0,i),D(0,i-N));
+    ASSERT_EQ(Dshift(0, i), D(0, i - N));
   };
 
   delete[] reala;
@@ -268,54 +286,65 @@ TYPED_TEST(MultiViewUnitTest, Shift1D)
 TYPED_TEST(MultiViewUnitTest, Shift2D)
 {
 
-  int N = 10;
-  TypeParam *a0 = new TypeParam[N*N];
-  TypeParam *b0 = new TypeParam[N*N];
-  TypeParam *a[2];
+  int N         = 10;
+  TypeParam* a0 = new TypeParam[N * N];
+  TypeParam* b0 = new TypeParam[N * N];
+  TypeParam* a[2];
   a[0] = a0;
   a[1] = b0;
 
   const int DIM = 2;
-  RAJA::OffsetLayout<DIM> layout = RAJA::make_offset_layout<DIM>({{0,0}},{{N,N}});
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> A(a,layout);
-  RAJA::MultiView<TypeParam, RAJA::Layout<DIM>> B(a,N,N);
-
-  for(int y=0; y<N; ++y) {
-    for(int x=0; x<N; ++x) {
-      A(0,y,x) = static_cast<TypeParam>(x + N*y);
-      B(1,y,x) = static_cast<TypeParam>(x + N*y + 1);
+  RAJA::OffsetLayout<DIM> layout =
+      RAJA::make_offset_layout<DIM>({{0, 0}}, {{N, N}});
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> A(a, layout);
+  RAJA::MultiView<TypeParam, RAJA::Layout<DIM>> B(a, N, N);
+
+  for (int y = 0; y < N; ++y)
+  {
+    for (int x = 0; x < N; ++x)
+    {
+      A(0, y, x) = static_cast<TypeParam>(x + N * y);
+      B(1, y, x) = static_cast<TypeParam>(x + N * y + 1);
     }
   }
 
-  //Create a view from a base view with an offsetlayout
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> Ashift = A.shift({{N,N}});
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> Bshift = B.shift({{N,N}});
+  // Create a view from a base view with an offsetlayout
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> Ashift =
+      A.shift({{N, N}});
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> Bshift =
+      B.shift({{N, N}});
 
-  for(int y=N; y<N+N; ++y) {
-    for(int x=N; x<N+N; ++x) {
-      ASSERT_EQ(Ashift(0,y,x),A(0,y-N,x-N));
-      ASSERT_EQ(Bshift(1,y,x),B(1,y-N,x-N));
+  for (int y = N; y < N + N; ++y)
+  {
+    for (int x = N; x < N + N; ++x)
+    {
+      ASSERT_EQ(Ashift(0, y, x), A(0, y - N, x - N));
+      ASSERT_EQ(Bshift(1, y, x), B(1, y - N, x - N));
     }
   }
 
-  //Create a view from a base view with permuted layout
-  std::array< RAJA::idx_t, 2> perm {{1, 0}};
+  // Create a view from a base view with permuted layout
+  std::array<RAJA::idx_t, 2> perm {{1, 0}};
   RAJA::OffsetLayout<2> playout =
-    RAJA::make_permuted_offset_layout<2>( {{0, 0}}, {{N, N}}, perm );
+      RAJA::make_permuted_offset_layout<2>({{0, 0}}, {{N, N}}, perm);
 
   RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> C(a, playout);
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> Cshift = C.shift({{N,N}});
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>,1> D(a, playout);
-  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>,1> Dshift1p = D.shift({{N,N}});
-
-  for(int y=N; y<N+N; ++y) {
-    for(int x=N; x<N+N; ++x) {
-      ASSERT_EQ(Cshift(0,y,x),C(0,y-N,x-N));
-      ASSERT_EQ(Cshift(1,y,x),C(1,y-N,x-N));
-      ASSERT_EQ(Dshift1p(y,0,x),D(y-N,0,x-N));
-      ASSERT_EQ(Dshift1p(y,1,x),D(y-N,1,x-N));
-      ASSERT_EQ(Dshift1p(y,1,x),C(1,y-N,x-N));
-      ASSERT_EQ(Cshift(0,y,x),D(y-N,0,x-N));
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>> Cshift =
+      C.shift({{N, N}});
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>, 1> D(a, playout);
+  RAJA::MultiView<TypeParam, RAJA::OffsetLayout<DIM>, 1> Dshift1p =
+      D.shift({{N, N}});
+
+  for (int y = N; y < N + N; ++y)
+  {
+    for (int x = N; x < N + N; ++x)
+    {
+      ASSERT_EQ(Cshift(0, y, x), C(0, y - N, x - N));
+      ASSERT_EQ(Cshift(1, y, x), C(1, y - N, x - N));
+      ASSERT_EQ(Dshift1p(y, 0, x), D(y - N, 0, x - N));
+      ASSERT_EQ(Dshift1p(y, 1, x), D(y - N, 1, x - N));
+      ASSERT_EQ(Dshift1p(y, 1, x), C(1, y - N, x - N));
+      ASSERT_EQ(Cshift(0, y, x), D(y - N, 0, x - N));
     }
   }
 
diff --git a/test/unit/view-layout/test-standard-layout.cpp b/test/unit/view-layout/test-standard-layout.cpp
index 160e39ac36..cf7ce50b79 100644
--- a/test/unit/view-layout/test-standard-layout.cpp
+++ b/test/unit/view-layout/test-standard-layout.cpp
@@ -69,7 +69,8 @@ TEST(LayoutUnitTest, 2D_IJ)
   ASSERT_EQ(4, layout(0, 4));
 
   // Check that we get the identity
-  for (int k = 0; k < 15; ++k) {
+  for (int k = 0; k < 15; ++k)
+  {
 
     // inverse map
     int i, j;
@@ -100,9 +101,8 @@ TEST(LayoutUnitTest, 2D_JI)
    * Linear indices range from [0, 15)
    *
    */
-  const my_layout layout =
-      RAJA::make_permuted_layout({{3, 5}},
-                                 RAJA::as_array<RAJA::PERM_JI>::get());
+  const my_layout layout = RAJA::make_permuted_layout(
+      {{3, 5}}, RAJA::as_array<RAJA::PERM_JI>::get());
 
   ASSERT_EQ(0, layout(0, 0));
 
@@ -113,7 +113,8 @@ TEST(LayoutUnitTest, 2D_JI)
   ASSERT_EQ(14, layout(2, 4));
 
   // Check that we get the identity (mod 15)
-  for (int k = 0; k < 15; ++k) {
+  for (int k = 0; k < 15; ++k)
+  {
 
     // inverse map
     int i, j;
@@ -158,7 +159,8 @@ TEST(LayoutUnitTest, 2D_IJ_ProjJ)
   ASSERT_EQ(0, layout(0, 5));
 
   // Check that we get the identity (mod 7)
-  for (int k = 0; k < 20; ++k) {
+  for (int k = 0; k < 20; ++k)
+  {
 
     // inverse map
     int i, j;
@@ -174,4 +176,3 @@ TEST(LayoutUnitTest, 2D_IJ_ProjJ)
     ASSERT_EQ(j, 0);
   }
 }
-
diff --git a/test/unit/view-layout/test-typedlayout.cpp b/test/unit/view-layout/test-typedlayout.cpp
index 6820da9b52..f15b0c40b4 100644
--- a/test/unit/view-layout/test-typedlayout.cpp
+++ b/test/unit/view-layout/test-typedlayout.cpp
@@ -8,8 +8,9 @@
 #include "RAJA_test-base.hpp"
 #include "RAJA_unit-test-types.hpp"
 
-template<typename T>
-class TypedLayoutUnitTest : public ::testing::Test {};
+template <typename T>
+class TypedLayoutUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(TypedLayoutUnitTest, UnitIndexTypes);
 
@@ -17,24 +18,26 @@ TYPED_TEST_SUITE(TypedLayoutUnitTest, UnitIndexTypes);
 TYPED_TEST(TypedLayoutUnitTest, TypedLayoutConstructors)
 {
 
-  const RAJA::TypedLayout<TypeParam, RAJA::tuple<TypeParam, TypeParam>> l(10,5);
+  const RAJA::TypedLayout<TypeParam, RAJA::tuple<TypeParam, TypeParam>> l(10,
+                                                                          5);
 
-  ASSERT_EQ(TypeParam{0}, l(TypeParam{0}, TypeParam{0}));
+  ASSERT_EQ(TypeParam {0}, l(TypeParam {0}, TypeParam {0}));
 
-  ASSERT_EQ(TypeParam{2}, l(TypeParam{0}, TypeParam{2}));
+  ASSERT_EQ(TypeParam {2}, l(TypeParam {0}, TypeParam {2}));
 
-  ASSERT_EQ(TypeParam{10}, l(TypeParam{2}, TypeParam{0}));
+  ASSERT_EQ(TypeParam {10}, l(TypeParam {2}, TypeParam {0}));
 
-  TypeParam x{5};
-  TypeParam y{0};
-  l.toIndices(TypeParam{10}, y, x);
-  ASSERT_EQ(x, TypeParam{0});
-  ASSERT_EQ(y, TypeParam{2});
+  TypeParam x {5};
+  TypeParam y {0};
+  l.toIndices(TypeParam {10}, y, x);
+  ASSERT_EQ(x, TypeParam {0});
+  ASSERT_EQ(y, TypeParam {2});
 }
 
 TYPED_TEST(TypedLayoutUnitTest, 2D_accessor)
 {
-  using my_layout = RAJA::TypedLayout<TypeParam, RAJA::tuple<TypeParam, TypeParam>>;
+  using my_layout =
+      RAJA::TypedLayout<TypeParam, RAJA::tuple<TypeParam, TypeParam>>;
 
   /*
    * Construct a 2D layout:
@@ -66,7 +69,8 @@ TYPED_TEST(TypedLayoutUnitTest, 2D_accessor)
   ASSERT_EQ(TypeParam(4), layout(0, 4));
 
   // Check that we get the identity
-  for (int k = 0; k < 15; ++k) {
+  for (int k = 0; k < 15; ++k)
+  {
 
     // inverse map
     TypeParam i, j;
@@ -82,12 +86,12 @@ TYPED_TEST(TypedLayoutUnitTest, 2D_accessor)
     ASSERT_EQ(k2, layout_a(i, j));
     ASSERT_EQ(k2, layout_b(i, j));
   }
-
 }
 
 TYPED_TEST(TypedLayoutUnitTest, 2D_IJ_ProjJ)
 {
-  using my_layout = RAJA::TypedLayout<TypeParam, RAJA::tuple<TypeParam, TypeParam>>;
+  using my_layout =
+      RAJA::TypedLayout<TypeParam, RAJA::tuple<TypeParam, TypeParam>>;
 
   /*
    * Construct a 2D projective layout:
@@ -118,7 +122,8 @@ TYPED_TEST(TypedLayoutUnitTest, 2D_IJ_ProjJ)
 
   TypeParam pK = 0;
   // Check that we get the identity (mod 7)
-  for (int k = 0; k < 20; ++k) {
+  for (int k = 0; k < 20; ++k)
+  {
 
     // inverse map
     TypeParam i, j;
@@ -139,50 +144,57 @@ TYPED_TEST(TypedLayoutUnitTest, 2D_IJ_ProjJ)
 TYPED_TEST(TypedLayoutUnitTest, 2D_StaticLayout)
 {
   RAJA::Layout<2, TypeParam> dynamic_layout(7, 5);
-  using static_layout = RAJA::TypedStaticLayout<RAJA::PERM_IJ,TypeParam,RAJA::list<TypeParam,TypeParam>,7,5>;
+  using static_layout =
+      RAJA::TypedStaticLayout<RAJA::PERM_IJ, TypeParam,
+                              RAJA::list<TypeParam, TypeParam>, 7, 5>;
 
   // Check that we get the same layout
-  for (TypeParam i = 0; i < 7; ++i) {
-    for (TypeParam j = 0; j < 5; ++j) {
+  for (TypeParam i = 0; i < 7; ++i)
+  {
+    for (TypeParam j = 0; j < 5; ++j)
+    {
 
-      ASSERT_EQ(dynamic_layout(i, j), static_layout::s_oper(i,j));
+      ASSERT_EQ(dynamic_layout(i, j), static_layout::s_oper(i, j));
     }
   }
-
 }
 
 TYPED_TEST(TypedLayoutUnitTest, 2D_PermutedStaticLayout)
 {
-  auto dynamic_layout =
-    RAJA::make_permuted_layout({{7, 5}},
-                               RAJA::as_array<RAJA::PERM_JI>::get());
-  using static_layout = RAJA::TypedStaticLayout<RAJA::PERM_JI,
-                                                TypeParam,
-                                                RAJA::list<TypeParam,TypeParam>, 7,5>;
+  auto dynamic_layout = RAJA::make_permuted_layout(
+      {{7, 5}}, RAJA::as_array<RAJA::PERM_JI>::get());
+  using static_layout =
+      RAJA::TypedStaticLayout<RAJA::PERM_JI, TypeParam,
+                              RAJA::list<TypeParam, TypeParam>, 7, 5>;
 
   // Check that we get the same layout
-  for (TypeParam i = 0; i < 7; ++i) {
-    for (TypeParam j = 0; j < 5; ++j) {
-      ASSERT_EQ(TypeParam(dynamic_layout(i, j)), static_layout::s_oper(i,j));
+  for (TypeParam i = 0; i < 7; ++i)
+  {
+    for (TypeParam j = 0; j < 5; ++j)
+    {
+      ASSERT_EQ(TypeParam(dynamic_layout(i, j)), static_layout::s_oper(i, j));
     }
   }
 }
 
 TYPED_TEST(TypedLayoutUnitTest, 3D_PermutedStaticLayout)
 {
-  auto dynamic_layout =
-    RAJA::make_permuted_layout({{7, 13, 5}},
-                               RAJA::as_array<RAJA::PERM_JKI>::get());
-  using static_layout = RAJA::TypedStaticLayout<RAJA::PERM_JKI,
-                                                TypeParam,
-                                                RAJA::list<TypeParam,TypeParam,TypeParam>,
-                                                7,13,5>;
+  auto dynamic_layout = RAJA::make_permuted_layout(
+      {{7, 13, 5}}, RAJA::as_array<RAJA::PERM_JKI>::get());
+  using static_layout =
+      RAJA::TypedStaticLayout<RAJA::PERM_JKI, TypeParam,
+                              RAJA::list<TypeParam, TypeParam, TypeParam>, 7,
+                              13, 5>;
 
   // Check that we get the same layout
-  for (TypeParam i = 0; i < 7; ++i) {
-    for (TypeParam j = 0; j < 9; ++j) {
-      for (TypeParam k = 0; k < 5; ++k) {
-        ASSERT_EQ(TypeParam(dynamic_layout(i, j, k)), static_layout::s_oper(i,j,k));
+  for (TypeParam i = 0; i < 7; ++i)
+  {
+    for (TypeParam j = 0; j < 9; ++j)
+    {
+      for (TypeParam k = 0; k < 5; ++k)
+      {
+        ASSERT_EQ(TypeParam(dynamic_layout(i, j, k)),
+                  static_layout::s_oper(i, j, k));
       }
     }
   }
@@ -191,20 +203,23 @@ TYPED_TEST(TypedLayoutUnitTest, 3D_PermutedStaticLayout)
 
 TYPED_TEST(TypedLayoutUnitTest, 4D_PermutedStaticLayout)
 {
-  auto dynamic_layout =
-    RAJA::make_permuted_layout({{7, 13, 5, 17}},
-                               RAJA::as_array<RAJA::PERM_LJKI>::get());
-  using static_layout = RAJA::TypedStaticLayout<RAJA::PERM_LJKI,
-                                                TypeParam,
-                                                RAJA::list<TypeParam,TypeParam,TypeParam,TypeParam>,
-                                                7,13,5,17>;
+  auto dynamic_layout = RAJA::make_permuted_layout(
+      {{7, 13, 5, 17}}, RAJA::as_array<RAJA::PERM_LJKI>::get());
+  using static_layout = RAJA::TypedStaticLayout<
+      RAJA::PERM_LJKI, TypeParam,
+      RAJA::list<TypeParam, TypeParam, TypeParam, TypeParam>, 7, 13, 5, 17>;
 
   // Check that we get the same layout
-  for (TypeParam i = 0; i < 7; ++i) {
-    for (TypeParam j = 0; j < 8; ++j) {
-      for (TypeParam k = 0; k < 5; ++k) {
-        for (TypeParam l = 0; l < 5; ++l) {
-          ASSERT_EQ(TypeParam(dynamic_layout(i, j, k, l)), static_layout::s_oper(i,j,k,l));
+  for (TypeParam i = 0; i < 7; ++i)
+  {
+    for (TypeParam j = 0; j < 8; ++j)
+    {
+      for (TypeParam k = 0; k < 5; ++k)
+      {
+        for (TypeParam l = 0; l < 5; ++l)
+        {
+          ASSERT_EQ(TypeParam(dynamic_layout(i, j, k, l)),
+                    static_layout::s_oper(i, j, k, l));
         }
       }
     }
diff --git a/test/unit/view-layout/test-typedview.cpp b/test/unit/view-layout/test-typedview.cpp
index b0823b93e0..e57e884edf 100644
--- a/test/unit/view-layout/test-typedview.cpp
+++ b/test/unit/view-layout/test-typedview.cpp
@@ -13,14 +13,17 @@ RAJA_INDEX_VALUE(TIX, "TIX");
 RAJA_INDEX_VALUE(TIY, "TIY");
 RAJA_INDEX_VALUE(TIL, "TIL");
 
-template<typename T>
-class TypedViewUnitTest : public ::testing::Test {};
+template <typename T>
+class TypedViewUnitTest : public ::testing::Test
+{};
 
-template<typename T>
-class OffsetLayoutViewUnitTest : public ::testing::Test {};
+template <typename T>
+class OffsetLayoutViewUnitTest : public ::testing::Test
+{};
 
-template<typename T>
-class TypedIntegralViewUnitTest : public ::testing::Test {};
+template <typename T>
+class TypedIntegralViewUnitTest : public ::testing::Test
+{};
 
 TYPED_TEST_SUITE(TypedViewUnitTest, UnitIntFloatTypes);
 TYPED_TEST_SUITE(OffsetLayoutViewUnitTest, UnitIntFloatTypes);
@@ -63,22 +66,23 @@ TYPED_TEST(TypedViewUnitTest, Accessor)
   const int Nx = 3;
   const int Ny = 5;
   const int Nz = 2;
-  const int N  = Nx*Ny*Nz;
-  TypeParam *a = new TypeParam[N];
+  const int N  = Nx * Ny * Nz;
+  TypeParam* a = new TypeParam[N];
 
-  int iter{0};
-  for(TypeParam i=0; i<TypeParam{N}; ++i)
+  int iter {0};
+  for (TypeParam i = 0; i < TypeParam {N}; ++i)
   {
-    a[iter] = TypeParam{i};
+    a[iter] = TypeParam {i};
     ++iter;
   }
 
   /*
    * 1D Accessor
    */
-  RAJA::View<TypeParam, RAJA::Layout<1>> view_1D(a,N);
-  TypeParam val{0};
-  for(int i=0; i<N; ++i) {
+  RAJA::View<TypeParam, RAJA::Layout<1>> view_1D(a, N);
+  TypeParam val {0};
+  for (int i = 0; i < N; ++i)
+  {
     ASSERT_EQ(val, view_1D(i));
     val++;
   }
@@ -86,11 +90,13 @@ TYPED_TEST(TypedViewUnitTest, Accessor)
   /*
    * 2D Accessor
    */
-  RAJA::View<TypeParam, RAJA::Layout<2>> view_2D(a,Ny,Nx);
-  val = TypeParam{0};
-  for(int j=0; j<Ny; ++j) {
-    for(int i=0; i<Nx; ++i) {
-      ASSERT_EQ(val, view_2D(j,i));
+  RAJA::View<TypeParam, RAJA::Layout<2>> view_2D(a, Ny, Nx);
+  val = TypeParam {0};
+  for (int j = 0; j < Ny; ++j)
+  {
+    for (int i = 0; i < Nx; ++i)
+    {
+      ASSERT_EQ(val, view_2D(j, i));
       val++;
     }
   }
@@ -98,12 +104,15 @@ TYPED_TEST(TypedViewUnitTest, Accessor)
   /*
    * 3D Accessor
    */
-  RAJA::View<TypeParam, RAJA::Layout<3>> view_3D(a,Nz,Ny,Nx);
-  val = TypeParam{0};
-  for(int k=0; k<Nz; ++k) {
-    for(int j=0; j<Ny; ++j) {
-      for(int i=0; i<Nx; ++i) {
-        ASSERT_EQ(val, view_3D(k,j,i));
+  RAJA::View<TypeParam, RAJA::Layout<3>> view_3D(a, Nz, Ny, Nx);
+  val = TypeParam {0};
+  for (int k = 0; k < Nz; ++k)
+  {
+    for (int j = 0; j < Ny; ++j)
+    {
+      for (int i = 0; i < Nx; ++i)
+      {
+        ASSERT_EQ(val, view_3D(k, j, i));
         val++;
       }
     }
@@ -117,22 +126,23 @@ TYPED_TEST(TypedIntegralViewUnitTest, TypedAccessor)
   const int Nx = 3;
   const int Ny = 5;
   const int Nz = 2;
-  const int N  = Nx*Ny*Nz;
-  TypeParam *a = new TypeParam[N];
+  const int N  = Nx * Ny * Nz;
+  TypeParam* a = new TypeParam[N];
 
-  int iter{0};
-  for(TypeParam i=0; i<TypeParam{N}; ++i)
+  int iter {0};
+  for (TypeParam i = 0; i < TypeParam {N}; ++i)
   {
-    a[iter] = TypeParam{i};
+    a[iter] = TypeParam {i};
     ++iter;
   }
 
   /*
    * 1D Typed Accessor
    */
-  RAJA::TypedView<TypeParam, RAJA::Layout<1>, TypeParam> view_1D(a,N);
-  TypeParam val{0};
-  for(TypeParam i=0; i<N; ++i) {
+  RAJA::TypedView<TypeParam, RAJA::Layout<1>, TypeParam> view_1D(a, N);
+  TypeParam val {0};
+  for (TypeParam i = 0; i < N; ++i)
+  {
     ASSERT_EQ(val, view_1D(i));
     val++;
   }
@@ -140,11 +150,13 @@ TYPED_TEST(TypedIntegralViewUnitTest, TypedAccessor)
   /*
    * 2D Typed Accessor
    */
-  RAJA::View<TypeParam, RAJA::Layout<2>> view_2D(a,Ny,Nx);
-  val = TypeParam{0};
-  for(TypeParam j=0; j<Ny; ++j) {
-    for(TypeParam i=0; i<Nx; ++i) {
-      ASSERT_EQ(val, view_2D(j,i));
+  RAJA::View<TypeParam, RAJA::Layout<2>> view_2D(a, Ny, Nx);
+  val = TypeParam {0};
+  for (TypeParam j = 0; j < Ny; ++j)
+  {
+    for (TypeParam i = 0; i < Nx; ++i)
+    {
+      ASSERT_EQ(val, view_2D(j, i));
       val++;
     }
   }
@@ -152,12 +164,15 @@ TYPED_TEST(TypedIntegralViewUnitTest, TypedAccessor)
   /*
    * 3D Typed Accessor
    */
-  RAJA::View<TypeParam, RAJA::Layout<3>> view_3D(a,Nz,Ny,Nx);
-  val = TypeParam{0};
-  for(TypeParam k=0; k<Nz; ++k) {
-    for(TypeParam j=0; j<Ny; ++j) {
-      for(TypeParam i=0; i<Nx; ++i) {
-        ASSERT_EQ(val, view_3D(k,j,i));
+  RAJA::View<TypeParam, RAJA::Layout<3>> view_3D(a, Nz, Ny, Nx);
+  val = TypeParam {0};
+  for (TypeParam k = 0; k < Nz; ++k)
+  {
+    for (TypeParam j = 0; j < Ny; ++j)
+    {
+      for (TypeParam i = 0; i < Nx; ++i)
+      {
+        ASSERT_EQ(val, view_3D(k, j, i));
         val++;
       }
     }
@@ -175,11 +190,13 @@ TYPED_TEST(OffsetLayoutViewUnitTest, View)
   /*
    * View is constructed by passing in the layout.
    */
-  std::array<RAJA::Index_type, 1> lower{{1}};
-  std::array<RAJA::Index_type, 1> upper{{11}};
-  RAJA::View<TypeParam, layout> view(data, RAJA::make_offset_layout<1>(lower, upper));
+  std::array<RAJA::Index_type, 1> lower {{1}};
+  std::array<RAJA::Index_type, 1> upper {{11}};
+  RAJA::View<TypeParam, layout> view(data,
+                                     RAJA::make_offset_layout<1>(lower, upper));
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++)
+  {
     data[i] = static_cast<TypeParam>(i);
   }
 
@@ -192,20 +209,21 @@ TYPED_TEST(OffsetLayoutViewUnitTest, View)
 TYPED_TEST(TypedViewUnitTest, Shift1D)
 {
 
-  int N = 10;
-  TypeParam *a = new TypeParam[N];
-  TypeParam *b = new TypeParam[N];
+  int N        = 10;
+  TypeParam* a = new TypeParam[N];
+  TypeParam* b = new TypeParam[N];
 
   /*
    * Create a view from a base view
    */
-  const int DIM = 1;
-  RAJA::OffsetLayout<DIM> layout = RAJA::make_offset_layout<DIM>({{0}},{{N}});
-  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> A(a,layout);
-  RAJA::View<TypeParam, RAJA::Layout<DIM>> B(a,N);
-  RAJA::TypedView<TypeParam, RAJA::Layout<DIM>,TX> C(a,N);
+  const int DIM                  = 1;
+  RAJA::OffsetLayout<DIM> layout = RAJA::make_offset_layout<DIM>({{0}}, {{N}});
+  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> A(a, layout);
+  RAJA::View<TypeParam, RAJA::Layout<DIM>> B(a, N);
+  RAJA::TypedView<TypeParam, RAJA::Layout<DIM>, TX> C(a, N);
 
-  for(int i=0; i<N; ++i) {
+  for (int i = 0; i < N; ++i)
+  {
     A(i) = static_cast<TypeParam>(i + 1);
   }
 
@@ -215,23 +233,24 @@ TYPED_TEST(TypedViewUnitTest, Shift1D)
   /*
    * Create a view from a base view with an offsetlayout
    */
-  RAJA::TypedView<TypeParam, RAJA::OffsetLayout<DIM>, TX> Cshift = C.shift({{N}});
+  RAJA::TypedView<TypeParam, RAJA::OffsetLayout<DIM>, TX> Cshift =
+      C.shift({{N}});
 
-  for(int i=N; i<2*N; ++i)
+  for (int i = N; i < 2 * N; ++i)
   {
-    ASSERT_EQ(Ashift(i),A(i-N));
-    ASSERT_EQ(Bshift(i),B(i-N));
+    ASSERT_EQ(Ashift(i), A(i - N));
+    ASSERT_EQ(Bshift(i), B(i - N));
   }
 
-  for(TX tx=TX{N}; tx<TX{2*N}; tx++)
+  for (TX tx = TX {N}; tx < TX {2 * N}; tx++)
   {
-    ASSERT_EQ(Cshift(tx),C(tx-N));
+    ASSERT_EQ(Cshift(tx), C(tx - N));
   }
 
   /*
    * Create a shifted view from a view with a typed layout
    */
-  using TLayout = RAJA::TypedLayout<TIL, RAJA::tuple<TIX>>;
+  using TLayout       = RAJA::TypedLayout<TIL, RAJA::tuple<TIX>>;
   using TOffsetLayout = RAJA::TypedOffsetLayout<TIL, RAJA::tuple<TIX>>;
 
   TLayout myLayout(10);
@@ -239,61 +258,67 @@ TYPED_TEST(TypedViewUnitTest, Shift1D)
   RAJA::View<TypeParam, TLayout> D(a, myLayout);
   RAJA::View<TypeParam, TOffsetLayout> Dshift = D.shift({{N}});
 
-  for(TIX i=TIX{N}; i<TIX{2*N}; ++i)
+  for (TIX i = TIX {N}; i < TIX {2 * N}; ++i)
   {
-    ASSERT_EQ(Dshift(i),D(i-N));
+    ASSERT_EQ(Dshift(i), D(i - N));
   };
 
   delete[] a;
   delete[] b;
-
 }
 
 
 TYPED_TEST(TypedViewUnitTest, Shift2D)
 {
 
-  int N = 10;
-  TypeParam *a = new TypeParam[N*N];
-  TypeParam *b = new TypeParam[N*N];
+  int N        = 10;
+  TypeParam* a = new TypeParam[N * N];
+  TypeParam* b = new TypeParam[N * N];
 
   const int DIM = 2;
-  RAJA::OffsetLayout<DIM> layout = RAJA::make_offset_layout<DIM>({{0,0}},{{N,N}});
-  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> A(a,layout);
-  RAJA::View<TypeParam, RAJA::Layout<DIM>> B(a,N,N);
+  RAJA::OffsetLayout<DIM> layout =
+      RAJA::make_offset_layout<DIM>({{0, 0}}, {{N, N}});
+  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> A(a, layout);
+  RAJA::View<TypeParam, RAJA::Layout<DIM>> B(a, N, N);
 
-  for(int y=0; y<N; ++y) {
-    for(int x=0; x<N; ++x) {
-      A(y,x) = static_cast<TypeParam>(x + N*y);
+  for (int y = 0; y < N; ++y)
+  {
+    for (int x = 0; x < N; ++x)
+    {
+      A(y, x) = static_cast<TypeParam>(x + N * y);
     }
   }
 
   /*
    * Create a view from a base view with an offsetlayout
    */
-  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> Ashift = A.shift({{N,N}});
-  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> Bshift = B.shift({{N,N}});
+  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> Ashift = A.shift({{N, N}});
+  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> Bshift = B.shift({{N, N}});
 
-  for(int y=N; y<N+N; ++y) {
-    for(int x=N; x<N+N; ++x) {
-      ASSERT_EQ(Ashift(y,x),A(y-N,x-N));
-      ASSERT_EQ(Bshift(y,x),B(y-N,x-N));
+  for (int y = N; y < N + N; ++y)
+  {
+    for (int x = N; x < N + N; ++x)
+    {
+      ASSERT_EQ(Ashift(y, x), A(y - N, x - N));
+      ASSERT_EQ(Bshift(y, x), B(y - N, x - N));
     }
   }
 
   /*
    * Create a view from a base view with permuted layout
    */
-  std::array< RAJA::idx_t, 2> perm {{1, 0}};
+  std::array<RAJA::idx_t, 2> perm {{1, 0}};
   RAJA::OffsetLayout<2> playout =
-    RAJA::make_permuted_offset_layout<2>( {{0, 0}}, {{N, N}}, perm );
+      RAJA::make_permuted_offset_layout<2>({{0, 0}}, {{N, N}}, perm);
 
   RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> C(a, playout);
-  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> Cshift = C.shift({{N,N}});
+  RAJA::View<TypeParam, RAJA::OffsetLayout<DIM>> Cshift = C.shift({{N, N}});
 
-  for(int y=N; y<N+N; ++y) {
-    for(int x=N; x<N+N; ++x) {
-      ASSERT_EQ(Cshift(y,x),C(y-N,x-N));
+  for (int y = N; y < N + N; ++y)
+  {
+    for (int x = N; x < N + N; ++x)
+    {
+      ASSERT_EQ(Cshift(y, x), C(y - N, x - N));
     }
   }
 
diff --git a/test/unit/workgroup/tests/test-util-workgroup-Enqueue.hpp b/test/unit/workgroup/tests/test-util-workgroup-Enqueue.hpp
index 7797ce9947..fb6dd0786e 100644
--- a/test/unit/workgroup/tests/test-util-workgroup-Enqueue.hpp
+++ b/test/unit/workgroup/tests/test-util-workgroup-Enqueue.hpp
@@ -17,19 +17,15 @@
 #include <random>
 
 
-template < typename IndexType,
-           typename ... Args >
+template <typename IndexType, typename... Args>
 struct EnqueueTestCallable
 {
-  EnqueueTestCallable(IndexType* _ptr, IndexType _val)
-    : ptr(_ptr)
-    , val(_val)
-  { }
+  EnqueueTestCallable(IndexType* _ptr, IndexType _val) : ptr(_ptr), val(_val) {}
 
-  EnqueueTestCallable(EnqueueTestCallable const&) = default;
+  EnqueueTestCallable(EnqueueTestCallable const&)            = default;
   EnqueueTestCallable& operator=(EnqueueTestCallable const&) = default;
 
-  EnqueueTestCallable(EnqueueTestCallable&& o) = default;
+  EnqueueTestCallable(EnqueueTestCallable&& o)            = default;
   EnqueueTestCallable& operator=(EnqueueTestCallable&& o) = default;
 
   RAJA_HOST_DEVICE void operator()(IndexType i, Args... args) const
@@ -40,7 +36,7 @@ struct EnqueueTestCallable
 
 private:
   IndexType* ptr;
-  IndexType  val;
+  IndexType val;
 };
 
 #endif  //__TEST_UTIL_WORKGROUP_ENQUEUE__
diff --git a/test/unit/workgroup/tests/test-util-workgroup-WorkStorage.hpp b/test/unit/workgroup/tests/test-util-workgroup-WorkStorage.hpp
index 5fa93fbf60..a93c932ec2 100644
--- a/test/unit/workgroup/tests/test-util-workgroup-WorkStorage.hpp
+++ b/test/unit/workgroup/tests/test-util-workgroup-WorkStorage.hpp
@@ -19,58 +19,59 @@
 #include <cstddef>
 
 
-template < typename T >
+template <typename T>
 struct TestCallable
 {
-  TestCallable(T _val)
-    : val(_val)
-  { }
+  TestCallable(T _val) : val(_val) {}
 
-  TestCallable(TestCallable const&) = delete;
+  TestCallable(TestCallable const&)            = delete;
   TestCallable& operator=(TestCallable const&) = delete;
 
-  TestCallable(TestCallable&& o)
-    : val(o.val)
-    , move_constructed(true)
+  TestCallable(TestCallable&& o) : val(o.val), move_constructed(true)
   {
     o.moved_from = true;
   }
 
   TestCallable& operator=(TestCallable&& o)
   {
-    val = o.val;
+    val          = o.val;
     o.moved_from = true;
     return *this;
   }
 
-  RAJA_HOST_DEVICE void operator()(
-      void* val_ptr, bool* move_constructed_ptr, bool* moved_from_ptr) const
+  RAJA_HOST_DEVICE void operator()(void* val_ptr,
+                                   bool* move_constructed_ptr,
+                                   bool* moved_from_ptr) const
   {
     *static_cast<T*>(val_ptr) = val;
-    *move_constructed_ptr = move_constructed;
-    *moved_from_ptr = moved_from;
+    *move_constructed_ptr     = move_constructed;
+    *moved_from_ptr           = moved_from;
   }
 
 private:
   T val;
+
 public:
   bool move_constructed = false;
-  bool moved_from = false;
+  bool moved_from       = false;
 };
 
 
 // work around inconsistent std::array support over stl versions
-template < typename T, size_t N >
+template <typename T, size_t N>
 struct TestArray
 {
-  T a[N]{};
+  T a[N] {};
   T& operator[](size_t i) { return a[i]; }
   T const& operator[](size_t i) const { return a[i]; }
   friend inline bool operator==(TestArray const& lhs, TestArray const& rhs)
   {
-    for (size_t i = 0; i < N; ++i) {
-      if (lhs[i] == rhs[i]) continue;
-      else return false;
+    for (size_t i = 0; i < N; ++i)
+    {
+      if (lhs[i] == rhs[i])
+        continue;
+      else
+        return false;
     }
     return true;
   }
diff --git a/test/unit/workgroup/tests/test-workgroup-Constructor.hpp b/test/unit/workgroup/tests/test-workgroup-Constructor.hpp
index 253015c5b8..7dbac1403d 100644
--- a/test/unit/workgroup/tests/test-workgroup-Constructor.hpp
+++ b/test/unit/workgroup/tests/test-workgroup-Constructor.hpp
@@ -20,102 +20,95 @@ template <typename ExecPolicy,
           typename StoragePolicy,
           typename DispatchTyper,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupConstructorSingle {
-template < typename ... Xargs >
-void operator()(RAJA::xargs<Xargs...>) const
+          typename Allocator>
+struct testWorkGroupConstructorSingle
 {
-  bool success = true;
+  template <typename... Xargs>
+  void operator()(RAJA::xargs<Xargs...>) const
+  {
+    bool success = true;
 
-  using DispatchPolicy = typename DispatchTyper::template type<>;
+    using DispatchPolicy = typename DispatchTyper::template type<>;
 
-  {
-    RAJA::WorkPool<
-                    RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                    IndexType,
-                    RAJA::xargs<Xargs...>,
-                    Allocator
-                  >
-        pool(Allocator{});
-
-    ASSERT_EQ(pool.num_loops(), (size_t)0);
-    ASSERT_EQ(pool.storage_bytes(), (size_t)0);
-
-    RAJA::WorkGroup<
-                    RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                    IndexType,
-                    RAJA::xargs<Xargs...>,
-                    Allocator
-                  >
-        group = pool.instantiate();
-
-    ASSERT_EQ(pool.num_loops(), (size_t)0);
-    ASSERT_EQ(pool.storage_bytes(), (size_t)0);
-
-    RAJA::WorkSite<
-                    RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                    IndexType,
-                    RAJA::xargs<Xargs...>,
-                    Allocator
-                  >
-        site = group.run(Xargs{}...);
-
-    using resource_type = typename RAJA::WorkPool<
-                    RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                    IndexType,
-                    RAJA::xargs<Xargs...>,
-                    Allocator
-                  >::resource_type;
-    auto e = resource_type::get_default().get_event();
-    e.wait();
-
-    pool.clear();
-    group.clear();
-    site.clear();
-
-    ASSERT_EQ(pool.num_loops(), (size_t)0);
-    ASSERT_EQ(pool.storage_bytes(), (size_t)0);
-  }
+    {
+      RAJA::WorkPool<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                           StoragePolicy, DispatchPolicy>,
+                     IndexType, RAJA::xargs<Xargs...>, Allocator>
+          pool(Allocator {});
 
-  ASSERT_TRUE(success);
-}
+      ASSERT_EQ(pool.num_loops(), (size_t)0);
+      ASSERT_EQ(pool.storage_bytes(), (size_t)0);
+
+      RAJA::WorkGroup<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                            StoragePolicy, DispatchPolicy>,
+                      IndexType, RAJA::xargs<Xargs...>, Allocator>
+          group = pool.instantiate();
+
+      ASSERT_EQ(pool.num_loops(), (size_t)0);
+      ASSERT_EQ(pool.storage_bytes(), (size_t)0);
+
+      RAJA::WorkSite<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                           StoragePolicy, DispatchPolicy>,
+                     IndexType, RAJA::xargs<Xargs...>, Allocator>
+          site = group.run(Xargs {}...);
+
+      using resource_type = typename RAJA::WorkPool<
+          RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy,
+                                DispatchPolicy>,
+          IndexType, RAJA::xargs<Xargs...>, Allocator>::resource_type;
+      auto e = resource_type::get_default().get_event();
+      e.wait();
+
+      pool.clear();
+      group.clear();
+      site.clear();
+
+      ASSERT_EQ(pool.num_loops(), (size_t)0);
+      ASSERT_EQ(pool.storage_bytes(), (size_t)0);
+    }
+
+    ASSERT_TRUE(success);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupConstructorSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                      RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                      StoragePolicy,
-                                      detail::indirect_function_call_dispatch_typer,
-                                      IndexType,
-                                      Allocator> {
-template < typename ... Xargs >
-void operator()(RAJA::xargs<Xargs...>) const
-{ }
+          typename Allocator>
+struct testWorkGroupConstructorSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    Allocator>
+{
+  template <typename... Xargs>
+  void operator()(RAJA::xargs<Xargs...>) const
+  {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupConstructorSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                      RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                      StoragePolicy,
-                                      detail::indirect_virtual_function_dispatch_typer,
-                                      IndexType,
-                                      Allocator> {
-template < typename ... Xargs >
-void operator()(RAJA::xargs<Xargs...>) const
-{ }
+          typename Allocator>
+struct testWorkGroupConstructorSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    Allocator>
+{
+  template <typename... Xargs>
+  void operator()(RAJA::xargs<Xargs...>) const
+  {}
 };
 
 #endif
@@ -123,23 +116,25 @@ void operator()(RAJA::xargs<Xargs...>) const
 
 template <typename T>
 class WorkGroupBasicConstructorSingleUnitTest : public ::testing::Test
-{
-};
+{};
 
 
 TYPED_TEST_SUITE_P(WorkGroupBasicConstructorSingleUnitTest);
 
-TYPED_TEST_P(WorkGroupBasicConstructorSingleUnitTest, BasicWorkGroupConstructorSingle)
+TYPED_TEST_P(WorkGroupBasicConstructorSingleUnitTest,
+             BasicWorkGroupConstructorSingle)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using OrderPolicy = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ExecPolicy    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using OrderPolicy   = typename camp::at<TypeParam, camp::num<1>>::type;
   using StoragePolicy = typename camp::at<TypeParam, camp::num<2>>::type;
   using DispatchTyper = typename camp::at<TypeParam, camp::num<3>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using Xargs = typename camp::at<TypeParam, camp::num<5>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<6>>::type;
+  using IndexType     = typename camp::at<TypeParam, camp::num<4>>::type;
+  using Xargs         = typename camp::at<TypeParam, camp::num<5>>::type;
+  using Allocator     = typename camp::at<TypeParam, camp::num<6>>::type;
 
-  testWorkGroupConstructorSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator >{}(Xargs{});
+  testWorkGroupConstructorSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                                 DispatchTyper, IndexType, Allocator> {}(
+      Xargs {});
 }
 
 #endif  //__TEST_WORKGROUP_CONSTRUCTOR__
diff --git a/test/unit/workgroup/tests/test-workgroup-Dispatcher.hpp b/test/unit/workgroup/tests/test-workgroup-Dispatcher.hpp
index 843f3b17a6..76016b7bee 100644
--- a/test/unit/workgroup/tests/test-workgroup-Dispatcher.hpp
+++ b/test/unit/workgroup/tests/test-workgroup-Dispatcher.hpp
@@ -15,75 +15,60 @@
 #include "RAJA_test-workgroup.hpp"
 
 
-template  < typename ForOnePol,
-            typename Invoker,
-            typename ... CallArgs >
-typename  std::enable_if<
-            !std::is_base_of<RunOnDevice, ForOnePol>::value
-          >::type
-call_dispatcher( Invoker invoker,
-                 CallArgs... callArgs )
+template <typename ForOnePol, typename Invoker, typename... CallArgs>
+typename std::enable_if<!std::is_base_of<RunOnDevice, ForOnePol>::value>::type
+call_dispatcher(Invoker invoker, CallArgs... callArgs)
 {
-  forone<ForOnePol>( [=] () {
-    invoker(callArgs...);
-  });
+  forone<ForOnePol>([=]() { invoker(callArgs...); });
 }
 
 #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
-template  < typename ForOnePol,
-            typename Invoker,
-            typename ... CallArgs >
-typename  std::enable_if<
-            std::is_base_of<RunOnDevice, ForOnePol>::value
-          >::type
-call_dispatcher( Invoker invoker,
-                 CallArgs... callArgs )
+template <typename ForOnePol, typename Invoker, typename... CallArgs>
+typename std::enable_if<std::is_base_of<RunOnDevice, ForOnePol>::value>::type
+call_dispatcher(Invoker invoker, CallArgs... callArgs)
 {
   RAJA::tuple<CallArgs...> lambda_capturable_callArgs(callArgs...);
-  forone<ForOnePol>( [=] RAJA_DEVICE () {
-    camp::invoke(lambda_capturable_callArgs, invoker);
-  });
+  forone<ForOnePol>([=] RAJA_DEVICE()
+                    { camp::invoke(lambda_capturable_callArgs, invoker); });
 }
 #endif
 
-template < typename IndexType,
-           typename ... Args >
+template <typename IndexType, typename... Args>
 struct DispatcherTestCallable
 {
-  DispatcherTestCallable(IndexType* _ptr_call, IndexType _val_call,
-                     IndexType* _ptr_dtor, IndexType _val_dtor)
-    : ptr_call(_ptr_call)
-    , val_call(_val_call)
-    , ptr_dtor(_ptr_dtor)
-    , val_dtor(_val_dtor)
-  { }
-
-  DispatcherTestCallable(DispatcherTestCallable const&) = delete;
+  DispatcherTestCallable(IndexType* _ptr_call,
+                         IndexType _val_call,
+                         IndexType* _ptr_dtor,
+                         IndexType _val_dtor)
+      : ptr_call(_ptr_call),
+        val_call(_val_call),
+        ptr_dtor(_ptr_dtor),
+        val_dtor(_val_dtor)
+  {}
+
+  DispatcherTestCallable(DispatcherTestCallable const&)            = delete;
   DispatcherTestCallable& operator=(DispatcherTestCallable const&) = delete;
 
   DispatcherTestCallable(DispatcherTestCallable&& o)
-    : ptr_call(o.ptr_call)
-    , val_call(o.val_call)
-    , ptr_dtor(o.ptr_dtor)
-    , val_dtor(o.val_dtor)
-    , move_constructed(true)
+      : ptr_call(o.ptr_call),
+        val_call(o.val_call),
+        ptr_dtor(o.ptr_dtor),
+        val_dtor(o.val_dtor),
+        move_constructed(true)
   {
     o.moved_from = true;
   }
   DispatcherTestCallable& operator=(DispatcherTestCallable&& o)
   {
-    ptr_call = o.ptr_call;
-    val_call = o.val_call;
-    ptr_dtor = o.ptr_dtor;
-    val_dtor = o.val_dtor;
+    ptr_call     = o.ptr_call;
+    val_call     = o.val_call;
+    ptr_dtor     = o.ptr_dtor;
+    val_dtor     = o.val_dtor;
     o.moved_from = true;
     return *this;
   }
 
-  ~DispatcherTestCallable()
-  {
-    *ptr_dtor = val_dtor;
-  }
+  ~DispatcherTestCallable() { *ptr_dtor = val_dtor; }
 
   RAJA_HOST_DEVICE void operator()(IndexType i, Args... args) const
   {
@@ -93,156 +78,165 @@ struct DispatcherTestCallable
 
 private:
   IndexType* ptr_call;
-  IndexType  val_call;
+  IndexType val_call;
   IndexType* ptr_dtor;
-  IndexType  val_dtor;
+  IndexType val_dtor;
+
 public:
   bool move_constructed = false;
-  bool moved_from = false;
+  bool moved_from       = false;
 };
 
-template < typename ExecPolicy,
-           typename DispatchTyper,
-           typename IndexType,
-           typename WORKING_RES,
-           typename ForOnePol >
-struct testWorkGroupDispatcherSingle {
-template < typename ... Args >
-void operator()(RAJA::xargs<Args...>) const
+template <typename ExecPolicy,
+          typename DispatchTyper,
+          typename IndexType,
+          typename WORKING_RES,
+          typename ForOnePol>
+struct testWorkGroupDispatcherSingle
 {
-  using TestCallable = DispatcherTestCallable<IndexType, Args...>;
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>) const
+  {
+    using TestCallable = DispatcherTestCallable<IndexType, Args...>;
 
-  camp::resources::Resource work_res{WORKING_RES()};
-  camp::resources::Resource host_res{camp::resources::Host()};
+    camp::resources::Resource work_res {WORKING_RES()};
+    camp::resources::Resource host_res {camp::resources::Host()};
 
-  static constexpr auto platform = RAJA::platform_of<ExecPolicy>::value;
-  using DispatchPolicy = typename DispatchTyper::template type<TestCallable>;
-  using Dispatcher_type = RAJA::detail::Dispatcher<
-      platform, DispatchPolicy, void, IndexType, Args...>;
-  using Invoker_type = typename Dispatcher_type::invoker_type;
-  using Dispatcher_cptr_type = typename Dispatcher_type::void_cptr_wrapper;
-  const Dispatcher_type* dispatcher =
-      RAJA::detail::get_Dispatcher<TestCallable, Dispatcher_type>(ExecPolicy{});
+    static constexpr auto platform = RAJA::platform_of<ExecPolicy>::value;
+    using DispatchPolicy  = typename DispatchTyper::template type<TestCallable>;
+    using Dispatcher_type = RAJA::detail::Dispatcher<platform, DispatchPolicy,
+                                                     void, IndexType, Args...>;
+    using Invoker_type    = typename Dispatcher_type::invoker_type;
+    using Dispatcher_cptr_type = typename Dispatcher_type::void_cptr_wrapper;
+    const Dispatcher_type* dispatcher =
+        RAJA::detail::get_Dispatcher<TestCallable, Dispatcher_type>(
+            ExecPolicy {});
 
-  TestCallable* old_obj = host_res.allocate<TestCallable>(1);
-  TestCallable* new_obj = host_res.allocate<TestCallable>(1);
-  TestCallable* wrk_obj = work_res.allocate<TestCallable>(1);
+    TestCallable* old_obj = host_res.allocate<TestCallable>(1);
+    TestCallable* new_obj = host_res.allocate<TestCallable>(1);
+    TestCallable* wrk_obj = work_res.allocate<TestCallable>(1);
 
-  IndexType* chckCall = host_res.allocate<IndexType>(3);
-  IndexType* testCall = host_res.allocate<IndexType>(3);
-  IndexType* workCall = work_res.allocate<IndexType>(3);
+    IndexType* chckCall = host_res.allocate<IndexType>(3);
+    IndexType* testCall = host_res.allocate<IndexType>(3);
+    IndexType* workCall = work_res.allocate<IndexType>(3);
 
-  IndexType* chckDtor = host_res.allocate<IndexType>(3);
-  IndexType* testDtor = host_res.allocate<IndexType>(3);
+    IndexType* chckDtor = host_res.allocate<IndexType>(3);
+    IndexType* testDtor = host_res.allocate<IndexType>(3);
 
 
-  chckCall[0] = (IndexType)5;
-  chckCall[1] = (IndexType)7;
-  chckCall[2] = (IndexType)5;
+    chckCall[0] = (IndexType)5;
+    chckCall[1] = (IndexType)7;
+    chckCall[2] = (IndexType)5;
 
-  testCall[0] = (IndexType)5;
-  testCall[1] = (IndexType)5;
-  testCall[2] = (IndexType)5;
+    testCall[0] = (IndexType)5;
+    testCall[1] = (IndexType)5;
+    testCall[2] = (IndexType)5;
 
-  work_res.memcpy(workCall, testCall, sizeof(IndexType) * 3);
+    work_res.memcpy(workCall, testCall, sizeof(IndexType) * 3);
 
-  testCall[0] = (IndexType)0;
-  testCall[1] = (IndexType)0;
-  testCall[2] = (IndexType)0;
+    testCall[0] = (IndexType)0;
+    testCall[1] = (IndexType)0;
+    testCall[2] = (IndexType)0;
 
 
-  chckDtor[0] = (IndexType)15;
-  chckDtor[1] = (IndexType)17;
-  chckDtor[2] = (IndexType)15;
+    chckDtor[0] = (IndexType)15;
+    chckDtor[1] = (IndexType)17;
+    chckDtor[2] = (IndexType)15;
 
-  testDtor[0] = (IndexType)15;
-  testDtor[1] = (IndexType)15;
-  testDtor[2] = (IndexType)15;
+    testDtor[0] = (IndexType)15;
+    testDtor[1] = (IndexType)15;
+    testDtor[2] = (IndexType)15;
 
 
-  new(old_obj) TestCallable(workCall, chckCall[1], testDtor+1, chckDtor[1]);
+    new (old_obj)
+        TestCallable(workCall, chckCall[1], testDtor + 1, chckDtor[1]);
 
-  ASSERT_FALSE(old_obj->move_constructed);
-  ASSERT_FALSE(old_obj->moved_from);
+    ASSERT_FALSE(old_obj->move_constructed);
+    ASSERT_FALSE(old_obj->moved_from);
 
 
-  dispatcher->move_construct_destroy(new_obj, old_obj);
+    dispatcher->move_construct_destroy(new_obj, old_obj);
 
-  ASSERT_TRUE(new_obj->move_constructed);
-  ASSERT_FALSE(new_obj->moved_from);
+    ASSERT_TRUE(new_obj->move_constructed);
+    ASSERT_FALSE(new_obj->moved_from);
 
-  ASSERT_EQ(testDtor[0], chckDtor[0]);
-  ASSERT_EQ(testDtor[1], chckDtor[1]);
-  ASSERT_EQ(testDtor[2], chckDtor[2]);
+    ASSERT_EQ(testDtor[0], chckDtor[0]);
+    ASSERT_EQ(testDtor[1], chckDtor[1]);
+    ASSERT_EQ(testDtor[2], chckDtor[2]);
 
-  testDtor[0] = (IndexType)15;
-  testDtor[1] = (IndexType)15;
-  testDtor[2] = (IndexType)15;
+    testDtor[0] = (IndexType)15;
+    testDtor[1] = (IndexType)15;
+    testDtor[2] = (IndexType)15;
 
 
-  work_res.memcpy(wrk_obj, new_obj, sizeof(TestCallable) * 1);
+    work_res.memcpy(wrk_obj, new_obj, sizeof(TestCallable) * 1);
 
-  // move a value onto device and fiddle
-  call_dispatcher<ForOnePol, Invoker_type, Dispatcher_cptr_type, IndexType, Args...>(
-      dispatcher->invoke, wrk_obj, (IndexType)1, Args{}...);
+    // move a value onto device and fiddle
+    call_dispatcher<ForOnePol, Invoker_type, Dispatcher_cptr_type, IndexType,
+                    Args...>(dispatcher->invoke, wrk_obj, (IndexType)1,
+                             Args {}...);
 
-  work_res.memcpy(testCall, workCall, sizeof(IndexType) * 3);
+    work_res.memcpy(testCall, workCall, sizeof(IndexType) * 3);
 
-  ASSERT_EQ(testCall[0], chckCall[0]);
-  ASSERT_EQ(testCall[1], chckCall[1]);
-  ASSERT_EQ(testCall[2], chckCall[2]);
+    ASSERT_EQ(testCall[0], chckCall[0]);
+    ASSERT_EQ(testCall[1], chckCall[1]);
+    ASSERT_EQ(testCall[2], chckCall[2]);
 
 
-  dispatcher->destroy(new_obj);
+    dispatcher->destroy(new_obj);
 
-  ASSERT_EQ(testDtor[0], chckDtor[0]);
-  ASSERT_EQ(testDtor[1], chckDtor[1]);
-  ASSERT_EQ(testDtor[2], chckDtor[2]);
+    ASSERT_EQ(testDtor[0], chckDtor[0]);
+    ASSERT_EQ(testDtor[1], chckDtor[1]);
+    ASSERT_EQ(testDtor[2], chckDtor[2]);
 
 
-  host_res.deallocate( old_obj );
-  host_res.deallocate( new_obj );
-  work_res.deallocate( wrk_obj );
-  host_res.deallocate( chckCall );
-  host_res.deallocate( testCall );
-  work_res.deallocate( workCall );
-  host_res.deallocate( chckDtor );
-  host_res.deallocate( testDtor );
-}
+    host_res.deallocate(old_obj);
+    host_res.deallocate(new_obj);
+    work_res.deallocate(wrk_obj);
+    host_res.deallocate(chckCall);
+    host_res.deallocate(testCall);
+    work_res.deallocate(workCall);
+    host_res.deallocate(chckDtor);
+    host_res.deallocate(testDtor);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename IndexType,
-           typename WORKING_RES,
-          typename ForOnePol
-          >
-struct testWorkGroupDispatcherSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                     detail::indirect_function_call_dispatch_typer,
-                                     IndexType,
-                                     WORKING_RES,
-                                     ForOnePol> {
-template < typename ... Args >
-void operator()(RAJA::xargs<Args...>) const
-{ }
+          typename WORKING_RES,
+          typename ForOnePol>
+struct testWorkGroupDispatcherSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    WORKING_RES,
+    ForOnePol>
+{
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>) const
+  {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename IndexType,
-           typename WORKING_RES,
-          typename ForOnePol
-          >
-struct testWorkGroupDispatcherSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                     detail::indirect_virtual_function_dispatch_typer,
-                                     IndexType,
-                                     WORKING_RES,
-                                     ForOnePol> {
-template < typename ... Args >
-void operator()(RAJA::xargs<Args...>) const
-{ }
+          typename WORKING_RES,
+          typename ForOnePol>
+struct testWorkGroupDispatcherSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    WORKING_RES,
+    ForOnePol>
+{
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>) const
+  {}
 };
 
 #endif
@@ -250,22 +244,22 @@ void operator()(RAJA::xargs<Args...>) const
 
 template <typename T>
 class WorkGroupBasicDispatcherSingleUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicDispatcherSingleUnitTest);
 
-TYPED_TEST_P(WorkGroupBasicDispatcherSingleUnitTest, BasicWorkGroupDispatcherSingle)
+TYPED_TEST_P(WorkGroupBasicDispatcherSingleUnitTest,
+             BasicWorkGroupDispatcherSingle)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ExecPolicy    = typename camp::at<TypeParam, camp::num<0>>::type;
   using DispatchTyper = typename camp::at<TypeParam, camp::num<1>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<2>>::type;
-  using Args = typename camp::at<TypeParam, camp::num<3>>::type;
-  using ResourceType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using ForOneType = typename camp::at<TypeParam, camp::num<5>>::type;
+  using IndexType     = typename camp::at<TypeParam, camp::num<2>>::type;
+  using Args          = typename camp::at<TypeParam, camp::num<3>>::type;
+  using ResourceType  = typename camp::at<TypeParam, camp::num<4>>::type;
+  using ForOneType    = typename camp::at<TypeParam, camp::num<5>>::type;
 
-  testWorkGroupDispatcherSingle< ExecPolicy, DispatchTyper, IndexType, ResourceType, ForOneType >{}(
-      Args{});
+  testWorkGroupDispatcherSingle<ExecPolicy, DispatchTyper, IndexType,
+                                ResourceType, ForOneType> {}(Args {});
 }
 
 #endif  //__TEST_WORKGROUP_DISPATCHER__
diff --git a/test/unit/workgroup/tests/test-workgroup-Enqueue-Multiple.hpp b/test/unit/workgroup/tests/test-workgroup-Enqueue-Multiple.hpp
index fcf24e89da..6b7572af83 100644
--- a/test/unit/workgroup/tests/test-workgroup-Enqueue-Multiple.hpp
+++ b/test/unit/workgroup/tests/test-workgroup-Enqueue-Multiple.hpp
@@ -23,104 +23,110 @@ template <typename ExecPolicy,
           typename StoragePolicy,
           typename DispatchTyper,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupEnqueueMultiple {
-template < typename ... Args >
-void operator()(
-    RAJA::xargs<Args...>, bool do_instantiate, size_t rep, size_t num) const
+          typename Allocator>
+struct testWorkGroupEnqueueMultiple
 {
-  IndexType success = (IndexType)1;
-
-  using range_segment = RAJA::TypedRangeSegment<IndexType>;
-  using callable = EnqueueTestCallable<IndexType, Args...>;
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>,
+                  bool do_instantiate,
+                  size_t rep,
+                  size_t num) const
+  {
+    IndexType success = (IndexType)1;
 
-  using DispatchPolicy = typename DispatchTyper::template type<
-      camp::list<range_segment, callable> >;
+    using range_segment = RAJA::TypedRangeSegment<IndexType>;
+    using callable      = EnqueueTestCallable<IndexType, Args...>;
 
-  using WorkPool_type = RAJA::WorkPool<
-                    RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                    IndexType,
-                    RAJA::xargs<Args...>,
-                    Allocator
-                  >;
+    using DispatchPolicy = typename DispatchTyper::template type<
+        camp::list<range_segment, callable>>;
 
-  using WorkGroup_type = RAJA::WorkGroup<
-                    RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                    IndexType,
-                    RAJA::xargs<Args...>,
-                    Allocator
-                  >;
+    using WorkPool_type =
+        RAJA::WorkPool<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<Args...>, Allocator>;
 
-  {
-    WorkPool_type pool(Allocator{});
+    using WorkGroup_type =
+        RAJA::WorkGroup<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                              StoragePolicy, DispatchPolicy>,
+                        IndexType, RAJA::xargs<Args...>, Allocator>;
 
-    // test_empty(pool);
-    ASSERT_EQ(pool.num_loops(), (size_t)0);
-    ASSERT_EQ(pool.storage_bytes(), (size_t)0);
+    {
+      WorkPool_type pool(Allocator {});
 
-    for (size_t i = 0; i < rep; ++i) {
+      // test_empty(pool);
+      ASSERT_EQ(pool.num_loops(), (size_t)0);
+      ASSERT_EQ(pool.storage_bytes(), (size_t)0);
 
+      for (size_t i = 0; i < rep; ++i)
       {
-        for (size_t i = 0; i < num; ++i) {
-          pool.enqueue(range_segment{0, 1}, callable{&success, IndexType(0)});
+
+        {
+          for (size_t i = 0; i < num; ++i)
+          {
+            pool.enqueue(range_segment {0, 1},
+                         callable {&success, IndexType(0)});
+          }
+
+          ASSERT_EQ(pool.num_loops(), (size_t)num);
+          ASSERT_GE(pool.storage_bytes(), num * sizeof(callable));
         }
 
-        ASSERT_EQ(pool.num_loops(), (size_t)num);
-        ASSERT_GE(pool.storage_bytes(), num*sizeof(callable));
-      }
+        if (do_instantiate)
+        {
+          WorkGroup_type group = pool.instantiate();
+        }
+        else
+        {
+          pool.clear();
+        }
 
-      if (do_instantiate) {
-        WorkGroup_type group = pool.instantiate();
-      } else {
-        pool.clear();
+        ASSERT_EQ(pool.num_loops(), (size_t)0);
+        ASSERT_EQ(pool.storage_bytes(), (size_t)0);
       }
-
-      ASSERT_EQ(pool.num_loops(), (size_t)0);
-      ASSERT_EQ(pool.storage_bytes(), (size_t)0);
     }
-  }
 
-  ASSERT_EQ(success, (IndexType)1);
-}
+    ASSERT_EQ(success, (IndexType)1);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupEnqueueMultiple<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                    StoragePolicy,
-                                    detail::indirect_function_call_dispatch_typer,
-                                    IndexType,
-                                    Allocator> {
-template < typename ... Args >
-void operator()(
-    RAJA::xargs<Args...>, bool, size_t, size_t) const
-{ }
+          typename Allocator>
+struct testWorkGroupEnqueueMultiple<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    Allocator>
+{
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>, bool, size_t, size_t) const
+  {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupEnqueueMultiple<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                    StoragePolicy,
-                                    detail::indirect_virtual_function_dispatch_typer,
-                                    IndexType,
-                                    Allocator> {
-template < typename ... Args >
-void operator()(
-    RAJA::xargs<Args...>, bool, size_t, size_t) const
-{ }
+          typename Allocator>
+struct testWorkGroupEnqueueMultiple<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    Allocator>
+{
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>, bool, size_t, size_t) const
+  {}
 };
 
 #endif
@@ -128,30 +134,32 @@ void operator()(
 
 template <typename T>
 class WorkGroupBasicEnqueueMultipleUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicEnqueueMultipleUnitTest);
 
 
-TYPED_TEST_P(WorkGroupBasicEnqueueMultipleUnitTest, BasicWorkGroupEnqueueMultiple)
+TYPED_TEST_P(WorkGroupBasicEnqueueMultipleUnitTest,
+             BasicWorkGroupEnqueueMultiple)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using OrderPolicy = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ExecPolicy    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using OrderPolicy   = typename camp::at<TypeParam, camp::num<1>>::type;
   using StoragePolicy = typename camp::at<TypeParam, camp::num<2>>::type;
   using DispatchTyper = typename camp::at<TypeParam, camp::num<3>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using Xargs = typename camp::at<TypeParam, camp::num<5>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<6>>::type;
+  using IndexType     = typename camp::at<TypeParam, camp::num<4>>::type;
+  using Xargs         = typename camp::at<TypeParam, camp::num<5>>::type;
+  using Allocator     = typename camp::at<TypeParam, camp::num<6>>::type;
 
-  std::mt19937 rng(std::random_device{}());
+  std::mt19937 rng(std::random_device {}());
   std::uniform_int_distribution<size_t> dist_rep(0, 16);
   std::uniform_int_distribution<size_t> dist_num(0, 64);
 
-  testWorkGroupEnqueueMultiple< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator >{}(
-      Xargs{}, false, dist_rep(rng), dist_num(rng));
-  testWorkGroupEnqueueMultiple< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator >{}(
-      Xargs{}, true, dist_rep(rng), dist_num(rng));
+  testWorkGroupEnqueueMultiple<ExecPolicy, OrderPolicy, StoragePolicy,
+                               DispatchTyper, IndexType, Allocator> {}(
+      Xargs {}, false, dist_rep(rng), dist_num(rng));
+  testWorkGroupEnqueueMultiple<ExecPolicy, OrderPolicy, StoragePolicy,
+                               DispatchTyper, IndexType, Allocator> {}(
+      Xargs {}, true, dist_rep(rng), dist_num(rng));
 }
 
 #endif  //__TEST_WORKGROUP_ENQUEUEMULTIPLE__
diff --git a/test/unit/workgroup/tests/test-workgroup-Enqueue-Single.hpp b/test/unit/workgroup/tests/test-workgroup-Enqueue-Single.hpp
index 282b911d93..ee172d7732 100644
--- a/test/unit/workgroup/tests/test-workgroup-Enqueue-Single.hpp
+++ b/test/unit/workgroup/tests/test-workgroup-Enqueue-Single.hpp
@@ -23,103 +23,110 @@ template <typename ExecPolicy,
           typename StoragePolicy,
           typename DispatchTyper,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupEnqueueSingle {
-template < typename ... Args >
-void operator()(RAJA::xargs<Args...>, bool do_instantiate, size_t rep, size_t num) const
+          typename Allocator>
+struct testWorkGroupEnqueueSingle
 {
-  IndexType success = (IndexType)1;
-
-  using range_segment = RAJA::TypedRangeSegment<IndexType>;
-  using callable = EnqueueTestCallable<IndexType, Args...>;
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>,
+                  bool do_instantiate,
+                  size_t rep,
+                  size_t num) const
+  {
+    IndexType success = (IndexType)1;
 
-  using DispatchPolicy = typename DispatchTyper::template type<
-      camp::list<range_segment, callable> >;
+    using range_segment = RAJA::TypedRangeSegment<IndexType>;
+    using callable      = EnqueueTestCallable<IndexType, Args...>;
 
-  using WorkPool_type = RAJA::WorkPool<
-                    RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                    IndexType,
-                    RAJA::xargs<Args...>,
-                    Allocator
-                  >;
+    using DispatchPolicy = typename DispatchTyper::template type<
+        camp::list<range_segment, callable>>;
 
-  using WorkGroup_type = RAJA::WorkGroup<
-                    RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy, StoragePolicy, DispatchPolicy>,
-                    IndexType,
-                    RAJA::xargs<Args...>,
-                    Allocator
-                  >;
+    using WorkPool_type =
+        RAJA::WorkPool<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                             StoragePolicy, DispatchPolicy>,
+                       IndexType, RAJA::xargs<Args...>, Allocator>;
 
-  {
-    WorkPool_type pool(Allocator{});
+    using WorkGroup_type =
+        RAJA::WorkGroup<RAJA::WorkGroupPolicy<ExecPolicy, OrderPolicy,
+                                              StoragePolicy, DispatchPolicy>,
+                        IndexType, RAJA::xargs<Args...>, Allocator>;
 
-    // test_empty(pool);
-    ASSERT_EQ(pool.num_loops(), (size_t)0);
-    ASSERT_EQ(pool.storage_bytes(), (size_t)0);
+    {
+      WorkPool_type pool(Allocator {});
 
-    for (size_t i = 0; i < rep; ++i) {
+      // test_empty(pool);
+      ASSERT_EQ(pool.num_loops(), (size_t)0);
+      ASSERT_EQ(pool.storage_bytes(), (size_t)0);
 
+      for (size_t i = 0; i < rep; ++i)
       {
-        for (size_t i = 0; i < num; ++i) {
-          pool.enqueue(range_segment{0, 1}, callable{&success, IndexType(0)});
+
+        {
+          for (size_t i = 0; i < num; ++i)
+          {
+            pool.enqueue(range_segment {0, 1},
+                         callable {&success, IndexType(0)});
+          }
+
+          ASSERT_EQ(pool.num_loops(), (size_t)num);
+          ASSERT_GE(pool.storage_bytes(), num * sizeof(callable));
         }
 
-        ASSERT_EQ(pool.num_loops(), (size_t)num);
-        ASSERT_GE(pool.storage_bytes(), num*sizeof(callable));
-      }
+        if (do_instantiate)
+        {
+          WorkGroup_type group = pool.instantiate();
+        }
+        else
+        {
+          pool.clear();
+        }
 
-      if (do_instantiate) {
-        WorkGroup_type group = pool.instantiate();
-      } else {
-        pool.clear();
+        ASSERT_EQ(pool.num_loops(), (size_t)0);
+        ASSERT_EQ(pool.storage_bytes(), (size_t)0);
       }
-
-      ASSERT_EQ(pool.num_loops(), (size_t)0);
-      ASSERT_EQ(pool.storage_bytes(), (size_t)0);
     }
-  }
 
-  ASSERT_EQ(success, (IndexType)1);
-}
+    ASSERT_EQ(success, (IndexType)1);
+  }
 };
 
 
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported types untested
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupEnqueueSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                  RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                  StoragePolicy,
-                                  detail::indirect_function_call_dispatch_typer,
-                                  IndexType,
-                                  Allocator> {
-template < typename ... Args >
-void operator()(
-    RAJA::xargs<Args...>, bool, size_t, size_t) const
-{ }
+          typename Allocator>
+struct testWorkGroupEnqueueSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_function_call_dispatch_typer,
+    IndexType,
+    Allocator>
+{
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>, bool, size_t, size_t) const
+  {}
 };
 ///
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE,
+          bool Async,
           typename StoragePolicy,
           typename IndexType,
-          typename Allocator
-          >
-struct testWorkGroupEnqueueSingle<RAJA::hip_work<BLOCK_SIZE, Async>,
-                                  RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
-                                  StoragePolicy,
-                                  detail::indirect_virtual_function_dispatch_typer,
-                                  IndexType,
-                                  Allocator> {
-template < typename ... Args >
-void operator()(
-    RAJA::xargs<Args...>, bool, size_t, size_t) const
-{ }
+          typename Allocator>
+struct testWorkGroupEnqueueSingle<
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    StoragePolicy,
+    detail::indirect_virtual_function_dispatch_typer,
+    IndexType,
+    Allocator>
+{
+  template <typename... Args>
+  void operator()(RAJA::xargs<Args...>, bool, size_t, size_t) const
+  {}
 };
 
 #endif
@@ -127,24 +134,27 @@ void operator()(
 
 template <typename T>
 class WorkGroupBasicEnqueueSingleUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicEnqueueSingleUnitTest);
 
 
 TYPED_TEST_P(WorkGroupBasicEnqueueSingleUnitTest, BasicWorkGroupEnqueueSingle)
 {
-  using ExecPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
-  using OrderPolicy = typename camp::at<TypeParam, camp::num<1>>::type;
+  using ExecPolicy    = typename camp::at<TypeParam, camp::num<0>>::type;
+  using OrderPolicy   = typename camp::at<TypeParam, camp::num<1>>::type;
   using StoragePolicy = typename camp::at<TypeParam, camp::num<2>>::type;
   using DispatchTyper = typename camp::at<TypeParam, camp::num<3>>::type;
-  using IndexType = typename camp::at<TypeParam, camp::num<4>>::type;
-  using Xargs = typename camp::at<TypeParam, camp::num<5>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<6>>::type;
-
-  testWorkGroupEnqueueSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator >{}(Xargs{}, false, 1, 1);
-  testWorkGroupEnqueueSingle< ExecPolicy, OrderPolicy, StoragePolicy, DispatchTyper, IndexType, Allocator >{}(Xargs{}, true, 1, 1);
+  using IndexType     = typename camp::at<TypeParam, camp::num<4>>::type;
+  using Xargs         = typename camp::at<TypeParam, camp::num<5>>::type;
+  using Allocator     = typename camp::at<TypeParam, camp::num<6>>::type;
+
+  testWorkGroupEnqueueSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                             DispatchTyper, IndexType, Allocator> {}(
+      Xargs {}, false, 1, 1);
+  testWorkGroupEnqueueSingle<ExecPolicy, OrderPolicy, StoragePolicy,
+                             DispatchTyper, IndexType, Allocator> {}(
+      Xargs {}, true, 1, 1);
 }
 
 #endif  //__TEST_WORKGROUP_ENQUEUESINGLE__
diff --git a/test/unit/workgroup/tests/test-workgroup-WorkStorage-Constructor.hpp b/test/unit/workgroup/tests/test-workgroup-WorkStorage-Constructor.hpp
index 6022e98919..603209ecee 100644
--- a/test/unit/workgroup/tests/test-workgroup-WorkStorage-Constructor.hpp
+++ b/test/unit/workgroup/tests/test-workgroup-WorkStorage-Constructor.hpp
@@ -20,32 +20,26 @@
 #include <cstddef>
 
 
-template <typename StoragePolicy,
-          typename DispatchTyper,
-          typename Allocator
-          >
+template <typename StoragePolicy, typename DispatchTyper, typename Allocator>
 void testWorkGroupWorkStorageConstructor()
 {
   bool success = true;
 
   static constexpr auto platform = RAJA::Platform::host;
-  using DispatchPolicy = typename DispatchTyper::template type<>;
-  using Dispatcher_type = RAJA::detail::Dispatcher<
-      platform, DispatchPolicy, void, void*, bool*, bool*>;
-  using WorkStorage_type = RAJA::detail::WorkStorage<
-                                                      StoragePolicy,
-                                                      Allocator,
-                                                      Dispatcher_type
-                                                    >;
+  using DispatchPolicy           = typename DispatchTyper::template type<>;
+  using Dispatcher_type = RAJA::detail::Dispatcher<platform, DispatchPolicy,
+                                                   void, void*, bool*, bool*>;
+  using WorkStorage_type =
+      RAJA::detail::WorkStorage<StoragePolicy, Allocator, Dispatcher_type>;
 
   {
-    auto test_empty = [&](WorkStorage_type& container) {
-
+    auto test_empty = [&](WorkStorage_type& container)
+    {
       ASSERT_EQ(container.size(), (size_t)(0));
       ASSERT_EQ(container.storage_size(), (size_t)0);
     };
 
-    WorkStorage_type container(Allocator{});
+    WorkStorage_type container(Allocator {});
 
     test_empty(container);
 
@@ -60,7 +54,7 @@ void testWorkGroupWorkStorageConstructor()
     test_empty(container2);
 
 
-    WorkStorage_type container3(Allocator{});
+    WorkStorage_type container3(Allocator {});
     container3 = std::move(container2);
 
     test_empty(container2);
@@ -73,19 +67,20 @@ void testWorkGroupWorkStorageConstructor()
 
 template <typename T>
 class WorkGroupBasicWorkStorageConstructorUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicWorkStorageConstructorUnitTest);
 
 
-TYPED_TEST_P(WorkGroupBasicWorkStorageConstructorUnitTest, BasicWorkGroupWorkStorageConstructor)
+TYPED_TEST_P(WorkGroupBasicWorkStorageConstructorUnitTest,
+             BasicWorkGroupWorkStorageConstructor)
 {
   using StoragePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
   using DispatchTyper = typename camp::at<TypeParam, camp::num<1>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<2>>::type;
+  using Allocator     = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  testWorkGroupWorkStorageConstructor< StoragePolicy, DispatchTyper, Allocator >();
+  testWorkGroupWorkStorageConstructor<StoragePolicy, DispatchTyper,
+                                      Allocator>();
 }
 
 
diff --git a/test/unit/workgroup/tests/test-workgroup-WorkStorage-InsertCall.hpp b/test/unit/workgroup/tests/test-workgroup-WorkStorage-InsertCall.hpp
index fd5a7aeaa3..d45a8d6ce6 100644
--- a/test/unit/workgroup/tests/test-workgroup-WorkStorage-InsertCall.hpp
+++ b/test/unit/workgroup/tests/test-workgroup-WorkStorage-InsertCall.hpp
@@ -20,10 +20,7 @@
 #include <cstddef>
 
 
-template <typename StoragePolicy,
-          typename DispatchTyper,
-          typename Allocator
-          >
+template <typename StoragePolicy, typename DispatchTyper, typename Allocator>
 void testWorkGroupWorkStorageInsertCall()
 {
   bool success = true;
@@ -31,28 +28,26 @@ void testWorkGroupWorkStorageInsertCall()
   using callable = TestCallable<double>;
 
   static constexpr auto platform = RAJA::Platform::host;
-  using DispatchPolicy = typename DispatchTyper::template type<callable>;
-  using Dispatcher_type = RAJA::detail::Dispatcher<
-      platform, DispatchPolicy, void, void*, bool*, bool*>;
-  using WorkStorage_type = RAJA::detail::WorkStorage<
-                                                      StoragePolicy,
-                                                      Allocator,
-                                                      Dispatcher_type
-                                                    >;
+  using DispatchPolicy  = typename DispatchTyper::template type<callable>;
+  using Dispatcher_type = RAJA::detail::Dispatcher<platform, DispatchPolicy,
+                                                   void, void*, bool*, bool*>;
+  using WorkStorage_type =
+      RAJA::detail::WorkStorage<StoragePolicy, Allocator, Dispatcher_type>;
   using WorkStruct_type = typename WorkStorage_type::value_type;
 
-  const Dispatcher_type* dispatcher = RAJA::detail::get_Dispatcher<
-      callable, Dispatcher_type>(RAJA::seq_work{});
+  const Dispatcher_type* dispatcher =
+      RAJA::detail::get_Dispatcher<callable, Dispatcher_type>(
+          RAJA::seq_work {});
 
   {
-    auto test_empty = [&](WorkStorage_type& container) {
-
+    auto test_empty = [&](WorkStorage_type& container)
+    {
       ASSERT_EQ(container.size(), (size_t)(0));
       ASSERT_EQ(container.storage_size(), (size_t)0);
     };
 
-    auto fill_contents = [&](WorkStorage_type& container, double init_val) {
-
+    auto fill_contents = [&](WorkStorage_type& container, double init_val)
+    {
       callable c(init_val);
 
       ASSERT_FALSE(c.move_constructed);
@@ -67,17 +62,18 @@ void testWorkGroupWorkStorageInsertCall()
       ASSERT_TRUE(container.storage_size() >= sizeof(callable));
     };
 
-    auto test_contents = [&](WorkStorage_type& container, double init_val) {
-
+    auto test_contents = [&](WorkStorage_type& container, double init_val)
+    {
       ASSERT_EQ(container.size(), (size_t)1);
       ASSERT_TRUE(container.storage_size() >= sizeof(callable));
 
       auto iter = container.begin();
 
-      double test_val = -1;
+      double test_val       = -1;
       bool move_constructed = false;
-      bool moved_from = true;
-      WorkStruct_type::host_call(&*iter, (void*)&test_val, &move_constructed, &moved_from);
+      bool moved_from       = true;
+      WorkStruct_type::host_call(&*iter, (void*)&test_val, &move_constructed,
+                                 &moved_from);
 
       ASSERT_EQ(test_val, init_val);
       ASSERT_TRUE(move_constructed);
@@ -85,7 +81,7 @@ void testWorkGroupWorkStorageInsertCall()
     };
 
 
-    WorkStorage_type container(Allocator{});
+    WorkStorage_type container(Allocator {});
 
     test_empty(container);
 
@@ -102,14 +98,14 @@ void testWorkGroupWorkStorageInsertCall()
     test_contents(container2, 1.23456789);
 
 
-    WorkStorage_type container3(Allocator{});
+    WorkStorage_type container3(Allocator {});
     container3 = std::move(container2);
 
     test_empty(container2);
     test_contents(container3, 1.23456789);
 
 
-    WorkStorage_type container4(Allocator{});
+    WorkStorage_type container4(Allocator {});
 
     fill_contents(container4, 2.34567891);
     test_contents(container4, 2.34567891);
@@ -126,19 +122,19 @@ void testWorkGroupWorkStorageInsertCall()
 
 template <typename T>
 class WorkGroupBasicWorkStorageInsertCallUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicWorkStorageInsertCallUnitTest);
 
 
-TYPED_TEST_P(WorkGroupBasicWorkStorageInsertCallUnitTest, BasicWorkGroupWorkStorageInsertCall)
+TYPED_TEST_P(WorkGroupBasicWorkStorageInsertCallUnitTest,
+             BasicWorkGroupWorkStorageInsertCall)
 {
   using StoragePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
   using DispatchTyper = typename camp::at<TypeParam, camp::num<1>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<2>>::type;
+  using Allocator     = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  testWorkGroupWorkStorageInsertCall< StoragePolicy, DispatchTyper, Allocator >();
+  testWorkGroupWorkStorageInsertCall<StoragePolicy, DispatchTyper, Allocator>();
 }
 
 #endif  //__TEST_WORKGROUP_WORKSTORAGEINSERTCALL__
diff --git a/test/unit/workgroup/tests/test-workgroup-WorkStorage-Iterator.hpp b/test/unit/workgroup/tests/test-workgroup-WorkStorage-Iterator.hpp
index 90cc7c1368..58206f5d90 100644
--- a/test/unit/workgroup/tests/test-workgroup-WorkStorage-Iterator.hpp
+++ b/test/unit/workgroup/tests/test-workgroup-WorkStorage-Iterator.hpp
@@ -20,10 +20,7 @@
 #include <cstddef>
 
 
-template <typename StoragePolicy,
-          typename DispatchTyper,
-          typename Allocator
-          >
+template <typename StoragePolicy, typename DispatchTyper, typename Allocator>
 void testWorkGroupWorkStorageIterator()
 {
   bool success = true;
@@ -31,23 +28,21 @@ void testWorkGroupWorkStorageIterator()
   using callable = TestCallable<int>;
 
   static constexpr auto platform = RAJA::Platform::host;
-  using DispatchPolicy = typename DispatchTyper::template type<callable>;
-  using Dispatcher_type = RAJA::detail::Dispatcher<
-      platform, DispatchPolicy, void, void*, bool*, bool*>;
-  using WorkStorage_type = RAJA::detail::WorkStorage<
-                                                      StoragePolicy,
-                                                      Allocator,
-                                                      Dispatcher_type
-                                                    >;
+  using DispatchPolicy  = typename DispatchTyper::template type<callable>;
+  using Dispatcher_type = RAJA::detail::Dispatcher<platform, DispatchPolicy,
+                                                   void, void*, bool*, bool*>;
+  using WorkStorage_type =
+      RAJA::detail::WorkStorage<StoragePolicy, Allocator, Dispatcher_type>;
 
 
-  const Dispatcher_type* dispatcher = RAJA::detail::get_Dispatcher<
-      callable, Dispatcher_type>(RAJA::seq_work{});
+  const Dispatcher_type* dispatcher =
+      RAJA::detail::get_Dispatcher<callable, Dispatcher_type>(
+          RAJA::seq_work {});
 
   {
-    WorkStorage_type container(Allocator{});
+    WorkStorage_type container(Allocator {});
 
-    ASSERT_EQ(container.end()-container.begin(), (std::ptrdiff_t)0);
+    ASSERT_EQ(container.end() - container.begin(), (std::ptrdiff_t)0);
     ASSERT_FALSE(container.begin() < container.end());
     ASSERT_FALSE(container.begin() > container.end());
     ASSERT_TRUE(container.begin() == container.end());
@@ -55,9 +50,9 @@ void testWorkGroupWorkStorageIterator()
     ASSERT_TRUE(container.begin() <= container.end());
     ASSERT_TRUE(container.begin() >= container.end());
 
-    container.template emplace<callable>(dispatcher, callable{-1});
+    container.template emplace<callable>(dispatcher, callable {-1});
 
-    ASSERT_EQ(container.end()-container.begin(), (std::ptrdiff_t)1);
+    ASSERT_EQ(container.end() - container.begin(), (std::ptrdiff_t)1);
     ASSERT_TRUE(container.begin() < container.end());
     ASSERT_FALSE(container.begin() > container.end());
     ASSERT_FALSE(container.begin() == container.end());
@@ -75,12 +70,12 @@ void testWorkGroupWorkStorageIterator()
       ASSERT_EQ(++iter, container.end());
       ASSERT_EQ(--iter, container.begin());
 
-      ASSERT_EQ(iter+1, container.end());
-      ASSERT_EQ(1+iter, container.end());
+      ASSERT_EQ(iter + 1, container.end());
+      ASSERT_EQ(1 + iter, container.end());
       ASSERT_EQ(++iter, container.end());
-      ASSERT_EQ(iter-1, container.begin());
-      ASSERT_EQ(iter-=1, container.begin());
-      ASSERT_EQ(iter+=1, container.end());
+      ASSERT_EQ(iter - 1, container.begin());
+      ASSERT_EQ(iter -= 1, container.begin());
+      ASSERT_EQ(iter += 1, container.end());
     }
   }
 
@@ -90,19 +85,19 @@ void testWorkGroupWorkStorageIterator()
 
 template <typename T>
 class WorkGroupBasicWorkStorageIteratorUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicWorkStorageIteratorUnitTest);
 
 
-TYPED_TEST_P(WorkGroupBasicWorkStorageIteratorUnitTest, BasicWorkGroupWorkStorageIterator)
+TYPED_TEST_P(WorkGroupBasicWorkStorageIteratorUnitTest,
+             BasicWorkGroupWorkStorageIterator)
 {
   using StoragePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
   using DispatchTyper = typename camp::at<TypeParam, camp::num<1>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<2>>::type;
+  using Allocator     = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  testWorkGroupWorkStorageIterator< StoragePolicy, DispatchTyper, Allocator >();
+  testWorkGroupWorkStorageIterator<StoragePolicy, DispatchTyper, Allocator>();
 }
 
 #endif  //__TEST_WORKGROUP_WORKSTORAGEITERATOR__
diff --git a/test/unit/workgroup/tests/test-workgroup-WorkStorage-Multiple.hpp b/test/unit/workgroup/tests/test-workgroup-WorkStorage-Multiple.hpp
index 103829be0b..49fe1a4d60 100644
--- a/test/unit/workgroup/tests/test-workgroup-WorkStorage-Multiple.hpp
+++ b/test/unit/workgroup/tests/test-workgroup-WorkStorage-Multiple.hpp
@@ -20,12 +20,10 @@
 #include <cstddef>
 
 
-template <typename StoragePolicy,
-          typename DispatchTyper,
-          typename Allocator
-          >
-void testWorkGroupWorkStorageMultiple(
-    const size_t num0, const size_t num1, const size_t num2)
+template <typename StoragePolicy, typename DispatchTyper, typename Allocator>
+void testWorkGroupWorkStorageMultiple(const size_t num0,
+                                      const size_t num1,
+                                      const size_t num2)
 {
   bool success = true;
 
@@ -33,20 +31,25 @@ void testWorkGroupWorkStorageMultiple(
   using type1 = TestArray<double, 6>;
   using type2 = TestArray<double, 14>;
 
-  auto make_type0 = [](double init_val, size_t i) {
+  auto make_type0 = [](double init_val, size_t i)
+  {
     type0 obj(init_val - (double)i);
     return obj;
   };
-  auto make_type1 = [](double init_val, size_t i) {
-    type1 obj{};
-    for (size_t j = 0; j < 6; ++j) {
+  auto make_type1 = [](double init_val, size_t i)
+  {
+    type1 obj {};
+    for (size_t j = 0; j < 6; ++j)
+    {
       obj[j] = init_val + 10.0 * j + i;
     }
     return obj;
   };
-  auto make_type2 = [](double init_val, size_t i) {
-    type2 obj{};
-    for (size_t j = 0; j < 14; ++j) {
+  auto make_type2 = [](double init_val, size_t i)
+  {
+    type2 obj {};
+    for (size_t j = 0; j < 14; ++j)
+    {
       obj[j] = init_val + 10.0 * j + i;
     }
     return obj;
@@ -57,89 +60,95 @@ void testWorkGroupWorkStorageMultiple(
   using callable2 = TestCallable<type2>;
 
   static constexpr auto platform = RAJA::Platform::host;
-  using DispatchPolicy = typename DispatchTyper::template type<callable0, callable1, callable2>;
-  using Dispatcher_type = RAJA::detail::Dispatcher<
-      platform, DispatchPolicy, void, void*, bool*, bool*>;
-  using WorkStorage_type = RAJA::detail::WorkStorage<
-                                                      StoragePolicy,
-                                                      Allocator,
-                                                      Dispatcher_type
-                                                    >;
+  using DispatchPolicy =
+      typename DispatchTyper::template type<callable0, callable1, callable2>;
+  using Dispatcher_type = RAJA::detail::Dispatcher<platform, DispatchPolicy,
+                                                   void, void*, bool*, bool*>;
+  using WorkStorage_type =
+      RAJA::detail::WorkStorage<StoragePolicy, Allocator, Dispatcher_type>;
   using WorkStruct_type = typename WorkStorage_type::value_type;
 
 
-  const Dispatcher_type* dispatcher0 = RAJA::detail::get_Dispatcher<
-      callable0, Dispatcher_type>(RAJA::seq_work{});
-  const Dispatcher_type* dispatcher1 = RAJA::detail::get_Dispatcher<
-      callable1, Dispatcher_type>(RAJA::seq_work{});
-  const Dispatcher_type* dispatcher2 = RAJA::detail::get_Dispatcher<
-      callable2, Dispatcher_type>(RAJA::seq_work{});
+  const Dispatcher_type* dispatcher0 =
+      RAJA::detail::get_Dispatcher<callable0, Dispatcher_type>(
+          RAJA::seq_work {});
+  const Dispatcher_type* dispatcher1 =
+      RAJA::detail::get_Dispatcher<callable1, Dispatcher_type>(
+          RAJA::seq_work {});
+  const Dispatcher_type* dispatcher2 =
+      RAJA::detail::get_Dispatcher<callable2, Dispatcher_type>(
+          RAJA::seq_work {});
 
   {
-    auto test_empty = [&](WorkStorage_type& container) {
-
+    auto test_empty = [&](WorkStorage_type& container)
+    {
       ASSERT_EQ(container.size(), (size_t)(0));
       ASSERT_EQ(container.storage_size(), (size_t)0);
     };
 
-    auto fill_contents = [&](WorkStorage_type& container, double init_val0, double init_val1, double init_val2) {
-
+    auto fill_contents = [&](WorkStorage_type& container, double init_val0,
+                             double init_val1, double init_val2)
+    {
       std::vector<callable0> vec0;
       vec0.reserve(num0);
-      for (size_t i = 0; i < num0; ++i) {
+      for (size_t i = 0; i < num0; ++i)
+      {
         vec0.emplace_back(make_type0(init_val0, i));
         ASSERT_FALSE(vec0[i].move_constructed);
         ASSERT_FALSE(vec0[i].moved_from);
         container.template emplace<callable0>(dispatcher0, std::move(vec0[i]));
         ASSERT_FALSE(vec0[i].move_constructed);
-        ASSERT_TRUE (vec0[i].moved_from);
+        ASSERT_TRUE(vec0[i].moved_from);
       }
 
       std::vector<callable1> vec1;
       vec1.reserve(num1);
-      for (size_t i = 0; i < num1; ++i) {
+      for (size_t i = 0; i < num1; ++i)
+      {
         vec1.emplace_back(make_type1(init_val1, i));
         ASSERT_FALSE(vec1[i].move_constructed);
         ASSERT_FALSE(vec1[i].moved_from);
         container.template emplace<callable1>(dispatcher1, std::move(vec1[i]));
         ASSERT_FALSE(vec1[i].move_constructed);
-        ASSERT_TRUE (vec1[i].moved_from);
+        ASSERT_TRUE(vec1[i].moved_from);
       }
 
       std::vector<callable2> vec2;
       vec2.reserve(num2);
-      for (size_t i = 0; i < num2; ++i) {
+      for (size_t i = 0; i < num2; ++i)
+      {
         vec2.emplace_back(make_type2(init_val2, i));
         ASSERT_FALSE(vec2[i].move_constructed);
         ASSERT_FALSE(vec2[i].moved_from);
         container.template emplace<callable2>(dispatcher2, std::move(vec2[i]));
         ASSERT_FALSE(vec2[i].move_constructed);
-        ASSERT_TRUE (vec2[i].moved_from);
+        ASSERT_TRUE(vec2[i].moved_from);
       }
 
-      ASSERT_EQ(container.size(), num0+num1+num2);
-      ASSERT_GE(container.storage_size(),
-          num0*sizeof(callable0) +
-          num1*sizeof(callable1) +
-          num2*sizeof(callable2));
+      ASSERT_EQ(container.size(), num0 + num1 + num2);
+      ASSERT_GE(container.storage_size(), num0 * sizeof(callable0) +
+                                              num1 * sizeof(callable1) +
+                                              num2 * sizeof(callable2));
     };
 
-    auto test_contents = [&](WorkStorage_type& container, double init_val0, double init_val1, double init_val2) {
-
-      ASSERT_EQ(container.size(), num0+num1+num2);
-      ASSERT_GE(container.storage_size(),
-          num0*sizeof(callable0) +
-          num1*sizeof(callable1) +
-          num2*sizeof(callable2));
+    auto test_contents = [&](WorkStorage_type& container, double init_val0,
+                             double init_val1, double init_val2)
+    {
+      ASSERT_EQ(container.size(), num0 + num1 + num2);
+      ASSERT_GE(container.storage_size(), num0 * sizeof(callable0) +
+                                              num1 * sizeof(callable1) +
+                                              num2 * sizeof(callable2));
 
       {
         auto iter = container.begin();
 
-        for (size_t i = 0; i < num0; ++i) {
-          type0 val{};
+        for (size_t i = 0; i < num0; ++i)
+        {
+          type0 val {};
           bool move_constructed = false;
-          bool moved_from = true;
-          WorkStruct_type::host_call(&*iter, (void*)&val, &move_constructed, &moved_from);
+          bool moved_from       = true;
+          WorkStruct_type::host_call(&*iter, (void*)&val, &move_constructed,
+                                     &moved_from);
 
           type0 expected = make_type0(init_val0, i);
           ASSERT_EQ(val, expected);
@@ -149,11 +158,13 @@ void testWorkGroupWorkStorageMultiple(
           ++iter;
         }
 
-        for (size_t i = 0; i < num1; ++i) {
-          type1 val{};
+        for (size_t i = 0; i < num1; ++i)
+        {
+          type1 val {};
           bool move_constructed = false;
-          bool moved_from = true;
-          WorkStruct_type::host_call(&*iter, (void*)&val, &move_constructed, &moved_from);
+          bool moved_from       = true;
+          WorkStruct_type::host_call(&*iter, (void*)&val, &move_constructed,
+                                     &moved_from);
 
           type1 expected = make_type1(init_val1, i);
           ASSERT_EQ(val, expected);
@@ -163,11 +174,13 @@ void testWorkGroupWorkStorageMultiple(
           ++iter;
         }
 
-        for (size_t i = 0; i < num2; ++i) {
-          type2 val{};
+        for (size_t i = 0; i < num2; ++i)
+        {
+          type2 val {};
           bool move_constructed = false;
-          bool moved_from = true;
-          WorkStruct_type::host_call(&*iter, (void*)&val, &move_constructed, &moved_from);
+          bool moved_from       = true;
+          WorkStruct_type::host_call(&*iter, (void*)&val, &move_constructed,
+                                     &moved_from);
 
           type2 expected = make_type2(init_val2, i);
           ASSERT_EQ(val, expected);
@@ -181,7 +194,7 @@ void testWorkGroupWorkStorageMultiple(
       }
     };
 
-    WorkStorage_type container(Allocator{});
+    WorkStorage_type container(Allocator {});
 
     test_empty(container);
     fill_contents(container, 1.0, 100.0, 1000.0);
@@ -199,14 +212,14 @@ void testWorkGroupWorkStorageMultiple(
     test_contents(container2, 1.0, 100.0, 1000.0);
 
 
-    WorkStorage_type container3(Allocator{});
+    WorkStorage_type container3(Allocator {});
     container3 = std::move(container2);
 
     test_empty(container2);
     test_contents(container3, 1.0, 100.0, 1000.0);
 
 
-    WorkStorage_type container4(Allocator{});
+    WorkStorage_type container4(Allocator {});
 
     fill_contents(container4, 1.5, 100.5, 1000.5);
     test_contents(container4, 1.5, 100.5, 1000.5);
@@ -215,7 +228,6 @@ void testWorkGroupWorkStorageMultiple(
 
     test_empty(container3);
     test_contents(container4, 1.0, 100.0, 1000.0);
-
   }
 
   ASSERT_TRUE(success);
@@ -224,22 +236,22 @@ void testWorkGroupWorkStorageMultiple(
 
 template <typename T>
 class WorkGroupBasicWorkStorageMultipleUnitTest : public ::testing::Test
-{
-};
+{};
 
 TYPED_TEST_SUITE_P(WorkGroupBasicWorkStorageMultipleUnitTest);
 
 
-TYPED_TEST_P(WorkGroupBasicWorkStorageMultipleUnitTest, BasicWorkGroupWorkStorageMultiple)
+TYPED_TEST_P(WorkGroupBasicWorkStorageMultipleUnitTest,
+             BasicWorkGroupWorkStorageMultiple)
 {
   using StoragePolicy = typename camp::at<TypeParam, camp::num<0>>::type;
   using DispatchTyper = typename camp::at<TypeParam, camp::num<1>>::type;
-  using Allocator = typename camp::at<TypeParam, camp::num<2>>::type;
+  using Allocator     = typename camp::at<TypeParam, camp::num<2>>::type;
 
-  std::mt19937 rng(std::random_device{}());
+  std::mt19937 rng(std::random_device {}());
   std::uniform_int_distribution<size_t> dist(0, 128);
 
-  testWorkGroupWorkStorageMultiple< StoragePolicy, DispatchTyper, Allocator >(
+  testWorkGroupWorkStorageMultiple<StoragePolicy, DispatchTyper, Allocator>(
       dist(rng), dist(rng), dist(rng));
 }